]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2014 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <linux/seccomp.h>
23 #include <seccomp.h>
24 #include <stddef.h>
25 #include <sys/mman.h>
26 #include <sys/prctl.h>
27 #include <sys/shm.h>
28
29 #include "af-list.h"
30 #include "alloc-util.h"
31 #include "macro.h"
32 #include "nsflags.h"
33 #include "process-util.h"
34 #include "seccomp-util.h"
35 #include "set.h"
36 #include "string-util.h"
37 #include "strv.h"
38 #include "util.h"
39 #include "errno-list.h"
40
41 const uint32_t seccomp_local_archs[] = {
42
43 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
44
45 #if defined(__x86_64__) && defined(__ILP32__)
46 SCMP_ARCH_X86,
47 SCMP_ARCH_X86_64,
48 SCMP_ARCH_X32, /* native */
49 #elif defined(__x86_64__) && !defined(__ILP32__)
50 SCMP_ARCH_X86,
51 SCMP_ARCH_X32,
52 SCMP_ARCH_X86_64, /* native */
53 #elif defined(__i386__)
54 SCMP_ARCH_X86,
55 #elif defined(__aarch64__)
56 SCMP_ARCH_ARM,
57 SCMP_ARCH_AARCH64, /* native */
58 #elif defined(__arm__)
59 SCMP_ARCH_ARM,
60 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
61 SCMP_ARCH_MIPSEL,
62 SCMP_ARCH_MIPS, /* native */
63 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL, /* native */
66 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
67 SCMP_ARCH_MIPSEL,
68 SCMP_ARCH_MIPS,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64N32,
71 SCMP_ARCH_MIPSEL64,
72 SCMP_ARCH_MIPS64, /* native */
73 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
74 SCMP_ARCH_MIPS,
75 SCMP_ARCH_MIPSEL,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32,
78 SCMP_ARCH_MIPS64,
79 SCMP_ARCH_MIPSEL64, /* native */
80 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
81 SCMP_ARCH_MIPSEL,
82 SCMP_ARCH_MIPS,
83 SCMP_ARCH_MIPSEL64,
84 SCMP_ARCH_MIPS64,
85 SCMP_ARCH_MIPSEL64N32,
86 SCMP_ARCH_MIPS64N32, /* native */
87 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
88 SCMP_ARCH_MIPS,
89 SCMP_ARCH_MIPSEL,
90 SCMP_ARCH_MIPS64,
91 SCMP_ARCH_MIPSEL64,
92 SCMP_ARCH_MIPS64N32,
93 SCMP_ARCH_MIPSEL64N32, /* native */
94 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
95 SCMP_ARCH_PPC,
96 SCMP_ARCH_PPC64LE,
97 SCMP_ARCH_PPC64, /* native */
98 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
99 SCMP_ARCH_PPC,
100 SCMP_ARCH_PPC64,
101 SCMP_ARCH_PPC64LE, /* native */
102 #elif defined(__powerpc__)
103 SCMP_ARCH_PPC,
104 #elif defined(__s390x__)
105 SCMP_ARCH_S390,
106 SCMP_ARCH_S390X, /* native */
107 #elif defined(__s390__)
108 SCMP_ARCH_S390,
109 #endif
110 (uint32_t) -1
111 };
112
113 const char* seccomp_arch_to_string(uint32_t c) {
114 /* Maintain order used in <seccomp.h>.
115 *
116 * Names used here should be the same as those used for ConditionArchitecture=,
117 * except for "subarchitectures" like x32. */
118
119 switch(c) {
120 case SCMP_ARCH_NATIVE:
121 return "native";
122 case SCMP_ARCH_X86:
123 return "x86";
124 case SCMP_ARCH_X86_64:
125 return "x86-64";
126 case SCMP_ARCH_X32:
127 return "x32";
128 case SCMP_ARCH_ARM:
129 return "arm";
130 case SCMP_ARCH_AARCH64:
131 return "arm64";
132 case SCMP_ARCH_MIPS:
133 return "mips";
134 case SCMP_ARCH_MIPS64:
135 return "mips64";
136 case SCMP_ARCH_MIPS64N32:
137 return "mips64-n32";
138 case SCMP_ARCH_MIPSEL:
139 return "mips-le";
140 case SCMP_ARCH_MIPSEL64:
141 return "mips64-le";
142 case SCMP_ARCH_MIPSEL64N32:
143 return "mips64-le-n32";
144 case SCMP_ARCH_PPC:
145 return "ppc";
146 case SCMP_ARCH_PPC64:
147 return "ppc64";
148 case SCMP_ARCH_PPC64LE:
149 return "ppc64-le";
150 case SCMP_ARCH_S390:
151 return "s390";
152 case SCMP_ARCH_S390X:
153 return "s390x";
154 default:
155 return NULL;
156 }
157 }
158
159 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
160 if (!n)
161 return -EINVAL;
162
163 assert(ret);
164
165 if (streq(n, "native"))
166 *ret = SCMP_ARCH_NATIVE;
167 else if (streq(n, "x86"))
168 *ret = SCMP_ARCH_X86;
169 else if (streq(n, "x86-64"))
170 *ret = SCMP_ARCH_X86_64;
171 else if (streq(n, "x32"))
172 *ret = SCMP_ARCH_X32;
173 else if (streq(n, "arm"))
174 *ret = SCMP_ARCH_ARM;
175 else if (streq(n, "arm64"))
176 *ret = SCMP_ARCH_AARCH64;
177 else if (streq(n, "mips"))
178 *ret = SCMP_ARCH_MIPS;
179 else if (streq(n, "mips64"))
180 *ret = SCMP_ARCH_MIPS64;
181 else if (streq(n, "mips64-n32"))
182 *ret = SCMP_ARCH_MIPS64N32;
183 else if (streq(n, "mips-le"))
184 *ret = SCMP_ARCH_MIPSEL;
185 else if (streq(n, "mips64-le"))
186 *ret = SCMP_ARCH_MIPSEL64;
187 else if (streq(n, "mips64-le-n32"))
188 *ret = SCMP_ARCH_MIPSEL64N32;
189 else if (streq(n, "ppc"))
190 *ret = SCMP_ARCH_PPC;
191 else if (streq(n, "ppc64"))
192 *ret = SCMP_ARCH_PPC64;
193 else if (streq(n, "ppc64-le"))
194 *ret = SCMP_ARCH_PPC64LE;
195 else if (streq(n, "s390"))
196 *ret = SCMP_ARCH_S390;
197 else if (streq(n, "s390x"))
198 *ret = SCMP_ARCH_S390X;
199 else
200 return -EINVAL;
201
202 return 0;
203 }
204
205 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
206 scmp_filter_ctx seccomp;
207 int r;
208
209 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
210 * any others. Also, turns off the NNP fiddling. */
211
212 seccomp = seccomp_init(default_action);
213 if (!seccomp)
214 return -ENOMEM;
215
216 if (arch != SCMP_ARCH_NATIVE &&
217 arch != seccomp_arch_native()) {
218
219 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_arch_add(seccomp, arch);
224 if (r < 0)
225 goto finish;
226
227 assert(seccomp_arch_exist(seccomp, arch) >= 0);
228 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
229 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
230 } else {
231 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
232 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
233 }
234
235 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
236 if (r < 0)
237 goto finish;
238
239 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
240 if (r < 0)
241 goto finish;
242
243 *ret = seccomp;
244 return 0;
245
246 finish:
247 seccomp_release(seccomp);
248 return r;
249 }
250
static bool is_basic_seccomp_available(void) {
        int mode;

        /* Probe with prctl(PR_GET_SECCOMP): any non-negative result counts as "available". */
        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        if (mode < 0)
                return false;

        return true;
}
254
static bool is_seccomp_filter_available(void) {
        /* Try to install a filter, passing NULL as the filter program. Filter mode is reported as available
         * only if the call fails with EFAULT; success or any other error yields false. */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
259
bool is_seccomp_available(void) {
        /* Result of the probe, remembered across calls: negative means "not probed yet". */
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        /* Both the basic probe and the filter-mode probe must succeed. */
        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
270
271 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
272 [SYSCALL_FILTER_SET_DEFAULT] = {
273 .name = "@default",
274 .help = "System calls that are always permitted",
275 .value =
276 "clock_getres\0"
277 "clock_gettime\0"
278 "clock_nanosleep\0"
279 "execve\0"
280 "exit\0"
281 "exit_group\0"
282 "futex\0"
283 "get_robust_list\0"
284 "get_thread_area\0"
285 "getegid\0"
286 "getegid32\0"
287 "geteuid\0"
288 "geteuid32\0"
289 "getgid\0"
290 "getgid32\0"
291 "getgroups\0"
292 "getgroups32\0"
293 "getpgid\0"
294 "getpgrp\0"
295 "getpid\0"
296 "getppid\0"
297 "getresgid\0"
298 "getresgid32\0"
299 "getresuid\0"
300 "getresuid32\0"
301 "getrlimit\0" /* make sure processes can query stack size and such */
302 "getsid\0"
303 "gettid\0"
304 "gettimeofday\0"
305 "getuid\0"
306 "getuid32\0"
307 "membarrier\0"
308 "nanosleep\0"
309 "pause\0"
310 "prlimit64\0"
311 "restart_syscall\0"
312 "rt_sigreturn\0"
313 "sched_yield\0"
314 "set_robust_list\0"
315 "set_thread_area\0"
316 "set_tid_address\0"
317 "set_tls\0"
318 "sigreturn\0"
319 "time\0"
320 "ugetrlimit\0"
321 },
322 [SYSCALL_FILTER_SET_AIO] = {
323 .name = "@aio",
324 .help = "Asynchronous IO",
325 .value =
326 "io_cancel\0"
327 "io_destroy\0"
328 "io_getevents\0"
329 "io_setup\0"
330 "io_submit\0"
331 },
332 [SYSCALL_FILTER_SET_BASIC_IO] = {
333 .name = "@basic-io",
334 .help = "Basic IO",
335 .value =
336 "_llseek\0"
337 "close\0"
338 "dup\0"
339 "dup2\0"
340 "dup3\0"
341 "lseek\0"
342 "pread64\0"
343 "preadv\0"
344 "preadv2\0"
345 "pwrite64\0"
346 "pwritev\0"
347 "pwritev2\0"
348 "read\0"
349 "readv\0"
350 "write\0"
351 "writev\0"
352 },
353 [SYSCALL_FILTER_SET_CHOWN] = {
354 .name = "@chown",
355 .help = "Change ownership of files and directories",
356 .value =
357 "chown\0"
358 "chown32\0"
359 "fchown\0"
360 "fchown32\0"
361 "fchownat\0"
362 "lchown\0"
363 "lchown32\0"
364 },
365 [SYSCALL_FILTER_SET_CLOCK] = {
366 .name = "@clock",
367 .help = "Change the system time",
368 .value =
369 "adjtimex\0"
370 "clock_adjtime\0"
371 "clock_settime\0"
372 "settimeofday\0"
373 "stime\0"
374 },
375 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
376 .name = "@cpu-emulation",
377 .help = "System calls for CPU emulation functionality",
378 .value =
379 "modify_ldt\0"
380 "subpage_prot\0"
381 "switch_endian\0"
382 "vm86\0"
383 "vm86old\0"
384 },
385 [SYSCALL_FILTER_SET_DEBUG] = {
386 .name = "@debug",
387 .help = "Debugging, performance monitoring and tracing functionality",
388 .value =
389 "lookup_dcookie\0"
390 "perf_event_open\0"
391 "process_vm_readv\0"
392 "process_vm_writev\0"
393 "ptrace\0"
394 "rtas\0"
395 #ifdef __NR_s390_runtime_instr
396 "s390_runtime_instr\0"
397 #endif
398 "sys_debug_setcontext\0"
399 },
400 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
401 .name = "@file-system",
402 .help = "File system operations",
403 .value =
404 "access\0"
405 "chdir\0"
406 "chmod\0"
407 "close\0"
408 "creat\0"
409 "faccessat\0"
410 "fallocate\0"
411 "fchdir\0"
412 "fchmod\0"
413 "fchmodat\0"
414 "fcntl\0"
415 "fcntl64\0"
416 "fgetxattr\0"
417 "flistxattr\0"
418 "fremovexattr\0"
419 "fsetxattr\0"
420 "fstat\0"
421 "fstat64\0"
422 "fstatat64\0"
423 "fstatfs\0"
424 "fstatfs64\0"
425 "ftruncate\0"
426 "ftruncate64\0"
427 "futimesat\0"
428 "getcwd\0"
429 "getdents\0"
430 "getdents64\0"
431 "getxattr\0"
432 "inotify_add_watch\0"
433 "inotify_init\0"
434 "inotify_init1\0"
435 "inotify_rm_watch\0"
436 "lgetxattr\0"
437 "link\0"
438 "linkat\0"
439 "listxattr\0"
440 "llistxattr\0"
441 "lremovexattr\0"
442 "lsetxattr\0"
443 "lstat\0"
444 "lstat64\0"
445 "mkdir\0"
446 "mkdirat\0"
447 "mknod\0"
448 "mknodat\0"
449 "mmap\0"
450 "mmap2\0"
451 "munmap\0"
452 "newfstatat\0"
453 "oldfstat\0"
454 "oldlstat\0"
455 "oldstat\0"
456 "open\0"
457 "openat\0"
458 "readlink\0"
459 "readlinkat\0"
460 "removexattr\0"
461 "rename\0"
462 "renameat\0"
463 "renameat2\0"
464 "rmdir\0"
465 "setxattr\0"
466 "stat\0"
467 "stat64\0"
468 "statfs\0"
469 "statfs64\0"
470 #ifdef __NR_statx
471 "statx\0"
472 #endif
473 "symlink\0"
474 "symlinkat\0"
475 "truncate\0"
476 "truncate64\0"
477 "unlink\0"
478 "unlinkat\0"
479 "utime\0"
480 "utimensat\0"
481 "utimes\0"
482 },
483 [SYSCALL_FILTER_SET_IO_EVENT] = {
484 .name = "@io-event",
485 .help = "Event loop system calls",
486 .value =
487 "_newselect\0"
488 "epoll_create\0"
489 "epoll_create1\0"
490 "epoll_ctl\0"
491 "epoll_ctl_old\0"
492 "epoll_pwait\0"
493 "epoll_wait\0"
494 "epoll_wait_old\0"
495 "eventfd\0"
496 "eventfd2\0"
497 "poll\0"
498 "ppoll\0"
499 "pselect6\0"
500 "select\0"
501 },
502 [SYSCALL_FILTER_SET_IPC] = {
503 .name = "@ipc",
504 .help = "SysV IPC, POSIX Message Queues or other IPC",
505 .value =
506 "ipc\0"
507 "memfd_create\0"
508 "mq_getsetattr\0"
509 "mq_notify\0"
510 "mq_open\0"
511 "mq_timedreceive\0"
512 "mq_timedsend\0"
513 "mq_unlink\0"
514 "msgctl\0"
515 "msgget\0"
516 "msgrcv\0"
517 "msgsnd\0"
518 "pipe\0"
519 "pipe2\0"
520 "process_vm_readv\0"
521 "process_vm_writev\0"
522 "semctl\0"
523 "semget\0"
524 "semop\0"
525 "semtimedop\0"
526 "shmat\0"
527 "shmctl\0"
528 "shmdt\0"
529 "shmget\0"
530 },
531 [SYSCALL_FILTER_SET_KEYRING] = {
532 .name = "@keyring",
533 .help = "Kernel keyring access",
534 .value =
535 "add_key\0"
536 "keyctl\0"
537 "request_key\0"
538 },
539 [SYSCALL_FILTER_SET_MEMLOCK] = {
540 .name = "@memlock",
541 .help = "Memory locking control",
542 .value =
543 "mlock\0"
544 "mlock2\0"
545 "mlockall\0"
546 "munlock\0"
547 "munlockall\0"
548 },
549 [SYSCALL_FILTER_SET_MODULE] = {
550 .name = "@module",
551 .help = "Loading and unloading of kernel modules",
552 .value =
553 "delete_module\0"
554 "finit_module\0"
555 "init_module\0"
556 },
557 [SYSCALL_FILTER_SET_MOUNT] = {
558 .name = "@mount",
559 .help = "Mounting and unmounting of file systems",
560 .value =
561 "chroot\0"
562 "mount\0"
563 "pivot_root\0"
564 "umount\0"
565 "umount2\0"
566 },
567 [SYSCALL_FILTER_SET_NETWORK_IO] = {
568 .name = "@network-io",
569 .help = "Network or Unix socket IO, should not be needed if not network facing",
570 .value =
571 "accept\0"
572 "accept4\0"
573 "bind\0"
574 "connect\0"
575 "getpeername\0"
576 "getsockname\0"
577 "getsockopt\0"
578 "listen\0"
579 "recv\0"
580 "recvfrom\0"
581 "recvmmsg\0"
582 "recvmsg\0"
583 "send\0"
584 "sendmmsg\0"
585 "sendmsg\0"
586 "sendto\0"
587 "setsockopt\0"
588 "shutdown\0"
589 "socket\0"
590 "socketcall\0"
591 "socketpair\0"
592 },
593 [SYSCALL_FILTER_SET_OBSOLETE] = {
594 /* some unknown even to libseccomp */
595 .name = "@obsolete",
596 .help = "Unusual, obsolete or unimplemented system calls",
597 .value =
598 "_sysctl\0"
599 "afs_syscall\0"
600 "bdflush\0"
601 "break\0"
602 "create_module\0"
603 "ftime\0"
604 "get_kernel_syms\0"
605 "getpmsg\0"
606 "gtty\0"
607 "idle\0"
608 "lock\0"
609 "mpx\0"
610 "prof\0"
611 "profil\0"
612 "putpmsg\0"
613 "query_module\0"
614 "security\0"
615 "sgetmask\0"
616 "ssetmask\0"
617 "stty\0"
618 "sysfs\0"
619 "tuxcall\0"
620 "ulimit\0"
621 "uselib\0"
622 "ustat\0"
623 "vserver\0"
624 },
625 [SYSCALL_FILTER_SET_PRIVILEGED] = {
626 .name = "@privileged",
627 .help = "All system calls which need super-user capabilities",
628 .value =
629 "@chown\0"
630 "@clock\0"
631 "@module\0"
632 "@raw-io\0"
633 "@reboot\0"
634 "@swap\0"
635 "_sysctl\0"
636 "acct\0"
637 "bpf\0"
638 "capset\0"
639 "chroot\0"
640 "nfsservctl\0"
641 "pivot_root\0"
642 "quotactl\0"
643 "setdomainname\0"
644 "setfsuid\0"
645 "setfsuid32\0"
646 "setgroups\0"
647 "setgroups32\0"
648 "sethostname\0"
649 "setresuid\0"
650 "setresuid32\0"
651 "setreuid\0"
652 "setreuid32\0"
653 "setuid\0"
654 "setuid32\0"
655 "vhangup\0"
656 },
657 [SYSCALL_FILTER_SET_PROCESS] = {
658 .name = "@process",
659 .help = "Process control, execution, namespaceing operations",
660 .value =
661 "arch_prctl\0"
662 "capget\0" /* Able to query arbitrary processes */
663 "clone\0"
664 "execveat\0"
665 "fork\0"
666 "getrusage\0"
667 "kill\0"
668 "prctl\0"
669 "rt_sigqueueinfo\0"
670 "rt_tgsigqueueinfo\0"
671 "setns\0"
672 "tgkill\0"
673 "times\0"
674 "tkill\0"
675 "unshare\0"
676 "vfork\0"
677 "wait4\0"
678 "waitid\0"
679 "waitpid\0"
680 },
681 [SYSCALL_FILTER_SET_RAW_IO] = {
682 .name = "@raw-io",
683 .help = "Raw I/O port access",
684 .value =
685 "ioperm\0"
686 "iopl\0"
687 "pciconfig_iobase\0"
688 "pciconfig_read\0"
689 "pciconfig_write\0"
690 #ifdef __NR_s390_pci_mmio_read
691 "s390_pci_mmio_read\0"
692 #endif
693 #ifdef __NR_s390_pci_mmio_write
694 "s390_pci_mmio_write\0"
695 #endif
696 },
697 [SYSCALL_FILTER_SET_REBOOT] = {
698 .name = "@reboot",
699 .help = "Reboot and reboot preparation/kexec",
700 .value =
701 "kexec_file_load\0"
702 "kexec_load\0"
703 "reboot\0"
704 },
705 [SYSCALL_FILTER_SET_RESOURCES] = {
706 .name = "@resources",
707 .help = "Alter resource settings",
708 .value =
709 "ioprio_set\0"
710 "mbind\0"
711 "migrate_pages\0"
712 "move_pages\0"
713 "nice\0"
714 "sched_setaffinity\0"
715 "sched_setattr\0"
716 "sched_setparam\0"
717 "sched_setscheduler\0"
718 "set_mempolicy\0"
719 "setpriority\0"
720 "setrlimit\0"
721 },
722 [SYSCALL_FILTER_SET_SETUID] = {
723 .name = "@setuid",
724 .help = "Operations for changing user/group credentials",
725 .value =
726 "setgid\0"
727 "setgid32\0"
728 "setgroups\0"
729 "setgroups32\0"
730 "setregid\0"
731 "setregid32\0"
732 "setresgid\0"
733 "setresgid32\0"
734 "setresuid\0"
735 "setresuid32\0"
736 "setreuid\0"
737 "setreuid32\0"
738 "setuid\0"
739 "setuid32\0"
740 },
741 [SYSCALL_FILTER_SET_SIGNAL] = {
742 .name = "@signal",
743 .help = "Process signal handling",
744 .value =
745 "rt_sigaction\0"
746 "rt_sigpending\0"
747 "rt_sigprocmask\0"
748 "rt_sigsuspend\0"
749 "rt_sigtimedwait\0"
750 "sigaction\0"
751 "sigaltstack\0"
752 "signal\0"
753 "signalfd\0"
754 "signalfd4\0"
755 "sigpending\0"
756 "sigprocmask\0"
757 "sigsuspend\0"
758 },
759 [SYSCALL_FILTER_SET_SWAP] = {
760 .name = "@swap",
761 .help = "Enable/disable swap devices",
762 .value =
763 "swapoff\0"
764 "swapon\0"
765 },
766 [SYSCALL_FILTER_SET_SYNC] = {
767 .name = "@sync",
768 .help = "Synchronize files and memory to storage",
769 .value =
770 "fdatasync\0"
771 "fsync\0"
772 "msync\0"
773 "sync\0"
774 "sync_file_range\0"
775 "syncfs\0"
776 },
777 [SYSCALL_FILTER_SET_TIMER] = {
778 .name = "@timer",
779 .help = "Schedule operations by time",
780 .value =
781 "alarm\0"
782 "getitimer\0"
783 "setitimer\0"
784 "timer_create\0"
785 "timer_delete\0"
786 "timer_getoverrun\0"
787 "timer_gettime\0"
788 "timer_settime\0"
789 "timerfd_create\0"
790 "timerfd_gettime\0"
791 "timerfd_settime\0"
792 "times\0"
793 },
794 };
795
796 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
797 unsigned i;
798
799 if (isempty(name) || name[0] != '@')
800 return NULL;
801
802 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
803 if (streq(syscall_filter_sets[i].name, name))
804 return syscall_filter_sets + i;
805
806 return NULL;
807 }
808
809 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
810
811 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
812 int r;
813
814 assert(seccomp);
815 assert(name);
816
817 if (strv_contains(exclude, name))
818 return 0;
819
820 if (name[0] == '@') {
821 const SyscallFilterSet *other;
822
823 other = syscall_filter_set_find(name);
824 if (!other) {
825 log_debug("Filter set %s is not known!", name);
826 return -EINVAL;
827 }
828
829 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
830 if (r < 0)
831 return r;
832 } else {
833 int id;
834
835 id = seccomp_syscall_resolve_name(name);
836 if (id == __NR_SCMP_ERROR) {
837 log_debug("System call %s is not known, ignoring.", name);
838 return 0;
839 }
840
841 r = seccomp_rule_add_exact(seccomp, action, id, 0);
842 if (r < 0)
843 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
844 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
845 }
846
847 return 0;
848 }
849
850 static int seccomp_add_syscall_filter_set(
851 scmp_filter_ctx seccomp,
852 const SyscallFilterSet *set,
853 uint32_t action,
854 char **exclude) {
855
856 const char *sys;
857 int r;
858
859 assert(seccomp);
860 assert(set);
861
862 NULSTR_FOREACH(sys, set->value) {
863 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
864 if (r < 0)
865 return r;
866 }
867
868 return 0;
869 }
870
871 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
872 uint32_t arch;
873 int r;
874
875 assert(set);
876
877 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
878 * earch local arch. */
879
880 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
881 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
882
883 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
884
885 r = seccomp_init_for_arch(&seccomp, arch, default_action);
886 if (r < 0)
887 return r;
888
889 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
890 if (r < 0) {
891 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
892 continue;
893 }
894
895 r = seccomp_load(seccomp);
896 if (IN_SET(r, -EPERM, -EACCES))
897 return r;
898 if (r < 0)
899 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
900 }
901
902 return 0;
903 }
904
905 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action) {
906 uint32_t arch;
907 int r;
908
909 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
910 * SyscallFilterSet* table. */
911
912 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
913 return 0;
914
915 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
916 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
917 Iterator i;
918 void *id, *val;
919
920 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
921
922 r = seccomp_init_for_arch(&seccomp, arch, default_action);
923 if (r < 0)
924 return r;
925
926 HASHMAP_FOREACH_KEY(val, id, set, i) {
927 uint32_t a = action;
928 int e = PTR_TO_INT(val);
929
930 if (action != SCMP_ACT_ALLOW && e >= 0)
931 a = SCMP_ACT_ERRNO(e);
932
933 r = seccomp_rule_add_exact(seccomp, a, PTR_TO_INT(id) - 1, 0);
934 if (r < 0) {
935 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
936 _cleanup_free_ char *n = NULL;
937
938 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
939 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
940 }
941 }
942
943 r = seccomp_load(seccomp);
944 if (IN_SET(r, -EPERM, -EACCES))
945 return r;
946 if (r < 0)
947 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
948 }
949
950 return 0;
951 }
952
953 int seccomp_restrict_namespaces(unsigned long retain) {
954 uint32_t arch;
955 int r;
956
957 if (log_get_max_level() >= LOG_DEBUG) {
958 _cleanup_free_ char *s = NULL;
959
960 (void) namespace_flag_to_string_many(retain, &s);
961 log_debug("Restricting namespace to: %s.", strna(s));
962 }
963
964 /* NOOP? */
965 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
966 return 0;
967
968 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
969 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
970 unsigned i;
971
972 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
973
974 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
975 if (r < 0)
976 return r;
977
978 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
979 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
980 * altogether. */
981 r = seccomp_rule_add_exact(
982 seccomp,
983 SCMP_ACT_ERRNO(EPERM),
984 SCMP_SYS(setns),
985 0);
986 else
987 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
988 * special invocation with a zero flags argument, right here. */
989 r = seccomp_rule_add_exact(
990 seccomp,
991 SCMP_ACT_ERRNO(EPERM),
992 SCMP_SYS(setns),
993 1,
994 SCMP_A1(SCMP_CMP_EQ, 0));
995 if (r < 0) {
996 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
997 continue;
998 }
999
1000 for (i = 0; namespace_flag_map[i].name; i++) {
1001 unsigned long f;
1002
1003 f = namespace_flag_map[i].flag;
1004 if ((retain & f) == f) {
1005 log_debug("Permitting %s.", namespace_flag_map[i].name);
1006 continue;
1007 }
1008
1009 log_debug("Blocking %s.", namespace_flag_map[i].name);
1010
1011 r = seccomp_rule_add_exact(
1012 seccomp,
1013 SCMP_ACT_ERRNO(EPERM),
1014 SCMP_SYS(unshare),
1015 1,
1016 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1017 if (r < 0) {
1018 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1019 break;
1020 }
1021
1022 /* On s390/s390x the first two parameters to clone are switched */
1023 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1024 r = seccomp_rule_add_exact(
1025 seccomp,
1026 SCMP_ACT_ERRNO(EPERM),
1027 SCMP_SYS(clone),
1028 1,
1029 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1030 else
1031 r = seccomp_rule_add_exact(
1032 seccomp,
1033 SCMP_ACT_ERRNO(EPERM),
1034 SCMP_SYS(clone),
1035 1,
1036 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1037 if (r < 0) {
1038 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1039 break;
1040 }
1041
1042 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1043 r = seccomp_rule_add_exact(
1044 seccomp,
1045 SCMP_ACT_ERRNO(EPERM),
1046 SCMP_SYS(setns),
1047 1,
1048 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1049 if (r < 0) {
1050 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1051 break;
1052 }
1053 }
1054 }
1055 if (r < 0)
1056 continue;
1057
1058 r = seccomp_load(seccomp);
1059 if (IN_SET(r, -EPERM, -EACCES))
1060 return r;
1061 if (r < 0)
1062 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1063 }
1064
1065 return 0;
1066 }
1067
1068 int seccomp_protect_sysctl(void) {
1069 uint32_t arch;
1070 int r;
1071
1072 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1073 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1074
1075 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1076
1077 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1078 /* No _sysctl syscall */
1079 continue;
1080
1081 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1082 if (r < 0)
1083 return r;
1084
1085 r = seccomp_rule_add_exact(
1086 seccomp,
1087 SCMP_ACT_ERRNO(EPERM),
1088 SCMP_SYS(_sysctl),
1089 0);
1090 if (r < 0) {
1091 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1092 continue;
1093 }
1094
1095 r = seccomp_load(seccomp);
1096 if (IN_SET(r, -EPERM, -EACCES))
1097 return r;
1098 if (r < 0)
1099 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1100 }
1101
1102 return 0;
1103 }
1104
1105 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1106 uint32_t arch;
1107 int r;
1108
1109 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1110 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1111 bool supported;
1112 Iterator i;
1113
1114 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1115
1116 switch (arch) {
1117
1118 case SCMP_ARCH_X86_64:
1119 case SCMP_ARCH_X32:
1120 case SCMP_ARCH_ARM:
1121 case SCMP_ARCH_AARCH64:
1122 case SCMP_ARCH_PPC64:
1123 case SCMP_ARCH_PPC64LE:
1124 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1125 supported = true;
1126 break;
1127
1128 case SCMP_ARCH_S390:
1129 case SCMP_ARCH_S390X:
1130 case SCMP_ARCH_PPC:
1131 case SCMP_ARCH_X86:
1132 default:
1133 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1134 * don't know */
1135 supported = false;
1136 break;
1137 }
1138
1139 if (!supported)
1140 continue;
1141
1142 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1143 if (r < 0)
1144 return r;
1145
1146 if (whitelist) {
1147 int af, first = 0, last = 0;
1148 void *afp;
1149
1150 /* If this is a whitelist, we first block the address families that are out of range and then
1151 * everything that is not in the set. First, we find the lowest and highest address family in
1152 * the set. */
1153
1154 SET_FOREACH(afp, address_families, i) {
1155 af = PTR_TO_INT(afp);
1156
1157 if (af <= 0 || af >= af_max())
1158 continue;
1159
1160 if (first == 0 || af < first)
1161 first = af;
1162
1163 if (last == 0 || af > last)
1164 last = af;
1165 }
1166
1167 assert((first == 0) == (last == 0));
1168
1169 if (first == 0) {
1170
1171 /* No entries in the valid range, block everything */
1172 r = seccomp_rule_add_exact(
1173 seccomp,
1174 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1175 SCMP_SYS(socket),
1176 0);
1177 if (r < 0) {
1178 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1179 continue;
1180 }
1181
1182 } else {
1183
1184 /* Block everything below the first entry */
1185 r = seccomp_rule_add_exact(
1186 seccomp,
1187 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1188 SCMP_SYS(socket),
1189 1,
1190 SCMP_A0(SCMP_CMP_LT, first));
1191 if (r < 0) {
1192 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1193 continue;
1194 }
1195
1196 /* Block everything above the last entry */
1197 r = seccomp_rule_add_exact(
1198 seccomp,
1199 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1200 SCMP_SYS(socket),
1201 1,
1202 SCMP_A0(SCMP_CMP_GT, last));
1203 if (r < 0) {
1204 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1205 continue;
1206 }
1207
1208 /* Block everything between the first and last entry */
1209 for (af = 1; af < af_max(); af++) {
1210
1211 if (set_contains(address_families, INT_TO_PTR(af)))
1212 continue;
1213
1214 r = seccomp_rule_add_exact(
1215 seccomp,
1216 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1217 SCMP_SYS(socket),
1218 1,
1219 SCMP_A0(SCMP_CMP_EQ, af));
1220 if (r < 0)
1221 break;
1222 }
1223 if (r < 0) {
1224 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1225 continue;
1226 }
1227 }
1228
1229 } else {
1230 void *af;
1231
1232 /* If this is a blacklist, then generate one rule for
1233 * each address family that are then combined in OR
1234 * checks. */
1235
1236 SET_FOREACH(af, address_families, i) {
1237
1238 r = seccomp_rule_add_exact(
1239 seccomp,
1240 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1241 SCMP_SYS(socket),
1242 1,
1243 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1244 if (r < 0)
1245 break;
1246 }
1247 if (r < 0) {
1248 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1249 continue;
1250 }
1251 }
1252
1253 r = seccomp_load(seccomp);
1254 if (IN_SET(r, -EPERM, -EACCES))
1255 return r;
1256 if (r < 0)
1257 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1258 }
1259
1260 return 0;
1261 }
1262
1263 int seccomp_restrict_realtime(void) {
1264 static const int permitted_policies[] = {
1265 SCHED_OTHER,
1266 SCHED_BATCH,
1267 SCHED_IDLE,
1268 };
1269
1270 int r, max_policy = 0;
1271 uint32_t arch;
1272 unsigned i;
1273
1274 /* Determine the highest policy constant we want to allow */
1275 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1276 if (permitted_policies[i] > max_policy)
1277 max_policy = permitted_policies[i];
1278
1279 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1280 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1281 int p;
1282
1283 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1284
1285 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1286 if (r < 0)
1287 return r;
1288
1289 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1290 * whitelist. */
1291 for (p = 0; p < max_policy; p++) {
1292 bool good = false;
1293
1294 /* Check if this is in the whitelist. */
1295 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1296 if (permitted_policies[i] == p) {
1297 good = true;
1298 break;
1299 }
1300
1301 if (good)
1302 continue;
1303
1304 /* Deny this policy */
1305 r = seccomp_rule_add_exact(
1306 seccomp,
1307 SCMP_ACT_ERRNO(EPERM),
1308 SCMP_SYS(sched_setscheduler),
1309 1,
1310 SCMP_A1(SCMP_CMP_EQ, p));
1311 if (r < 0) {
1312 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1313 continue;
1314 }
1315 }
1316
1317 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1318 * unsigned here, hence no need no check for < 0 values. */
1319 r = seccomp_rule_add_exact(
1320 seccomp,
1321 SCMP_ACT_ERRNO(EPERM),
1322 SCMP_SYS(sched_setscheduler),
1323 1,
1324 SCMP_A1(SCMP_CMP_GT, max_policy));
1325 if (r < 0) {
1326 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1327 continue;
1328 }
1329
1330 r = seccomp_load(seccomp);
1331 if (IN_SET(r, -EPERM, -EACCES))
1332 return r;
1333 if (r < 0)
1334 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1335 }
1336
1337 return 0;
1338 }
1339
1340 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1341 uint32_t arch,
1342 int nr,
1343 unsigned int arg_cnt,
1344 const struct scmp_arg_cmp arg) {
1345 int r;
1346
1347 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1348 if (r < 0) {
1349 _cleanup_free_ char *n = NULL;
1350
1351 n = seccomp_syscall_resolve_num_arch(arch, nr);
1352 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1353 strna(n),
1354 seccomp_arch_to_string(arch));
1355 }
1356
1357 return r;
1358 }
1359
1360 /* For known architectures, check that syscalls are indeed defined or not. */
1361 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1362 assert_cc(SCMP_SYS(shmget) > 0);
1363 assert_cc(SCMP_SYS(shmat) > 0);
1364 assert_cc(SCMP_SYS(shmdt) > 0);
1365 #elif defined(__i386__) || defined(__powerpc64__)
1366 assert_cc(SCMP_SYS(shmget) < 0);
1367 assert_cc(SCMP_SYS(shmat) < 0);
1368 assert_cc(SCMP_SYS(shmdt) < 0);
1369 #endif
1370
1371 int seccomp_memory_deny_write_execute(void) {
1372
1373 uint32_t arch;
1374 int r;
1375
1376 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1377 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1378 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1379
1380 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1381
1382 switch (arch) {
1383
1384 case SCMP_ARCH_X86:
1385 filter_syscall = SCMP_SYS(mmap2);
1386 block_syscall = SCMP_SYS(mmap);
1387 break;
1388
1389 case SCMP_ARCH_PPC64:
1390 case SCMP_ARCH_PPC64LE:
1391 filter_syscall = SCMP_SYS(mmap);
1392
1393 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1394 * We ignore that here, which means there's still a way to get writable/executable
1395 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1396
1397 break;
1398
1399 case SCMP_ARCH_ARM:
1400 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1401 shmat_syscall = SCMP_SYS(shmat);
1402 break;
1403
1404 case SCMP_ARCH_X86_64:
1405 case SCMP_ARCH_X32:
1406 case SCMP_ARCH_AARCH64:
1407 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1408 shmat_syscall = SCMP_SYS(shmat);
1409 break;
1410
1411 /* Please add more definitions here, if you port systemd to other architectures! */
1412
1413 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1414 #warning "Consider adding the right mmap() syscall definitions here!"
1415 #endif
1416 }
1417
1418 /* Can't filter mmap() on this arch, then skip it */
1419 if (filter_syscall == 0)
1420 continue;
1421
1422 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1423 if (r < 0)
1424 return r;
1425
1426 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1427 1,
1428 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1429 if (r < 0)
1430 continue;
1431
1432 if (block_syscall != 0) {
1433 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1434 if (r < 0)
1435 continue;
1436 }
1437
1438 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1439 1,
1440 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1441 if (r < 0)
1442 continue;
1443
1444 #ifdef __NR_pkey_mprotect
1445 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1446 1,
1447 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1448 if (r < 0)
1449 continue;
1450 #endif
1451
1452 if (shmat_syscall != 0) {
1453 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1454 1,
1455 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1456 if (r < 0)
1457 continue;
1458 }
1459
1460 r = seccomp_load(seccomp);
1461 if (IN_SET(r, -EPERM, -EACCES))
1462 return r;
1463 if (r < 0)
1464 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1465 }
1466
1467 return 0;
1468 }
1469
1470 int seccomp_restrict_archs(Set *archs) {
1471 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1472 Iterator i;
1473 void *id;
1474 int r;
1475
1476 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1477 * list. */
1478
1479 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1480 if (!seccomp)
1481 return -ENOMEM;
1482
1483 SET_FOREACH(id, archs, i) {
1484 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1485 if (r == -EEXIST)
1486 continue;
1487 if (r < 0)
1488 return r;
1489 }
1490
1491 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1492 if (r < 0)
1493 return r;
1494
1495 r = seccomp_load(seccomp);
1496 if (IN_SET(r, -EPERM, -EACCES))
1497 return r;
1498 if (r < 0)
1499 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1500
1501 return 0;
1502 }
1503
1504 int parse_syscall_archs(char **l, Set **archs) {
1505 _cleanup_set_free_ Set *_archs;
1506 char **s;
1507 int r;
1508
1509 assert(l);
1510 assert(archs);
1511
1512 r = set_ensure_allocated(&_archs, NULL);
1513 if (r < 0)
1514 return r;
1515
1516 STRV_FOREACH(s, l) {
1517 uint32_t a;
1518
1519 r = seccomp_arch_from_string(*s, &a);
1520 if (r < 0)
1521 return -EINVAL;
1522
1523 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1524 if (r < 0)
1525 return -ENOMEM;
1526 }
1527
1528 *archs = _archs;
1529 _archs = NULL;
1530
1531 return 0;
1532 }
1533
1534 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1535 const char *i;
1536 int r;
1537
1538 assert(set);
1539
1540 NULSTR_FOREACH(i, set->value) {
1541
1542 if (i[0] == '@') {
1543 const SyscallFilterSet *more;
1544
1545 more = syscall_filter_set_find(i);
1546 if (!more)
1547 return -ENXIO;
1548
1549 r = seccomp_filter_set_add(filter, add, more);
1550 if (r < 0)
1551 return r;
1552 } else {
1553 int id;
1554
1555 id = seccomp_syscall_resolve_name(i);
1556 if (id == __NR_SCMP_ERROR) {
1557 log_debug("Couldn't resolve system call, ignoring: %s", i);
1558 continue;
1559 }
1560
1561 if (add) {
1562 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1563 if (r < 0)
1564 return r;
1565 } else
1566 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1567 }
1568 }
1569
1570 return 0;
1571 }
1572
1573 int seccomp_lock_personality(unsigned long personality) {
1574 uint32_t arch;
1575 int r;
1576
1577 if (personality >= PERSONALITY_INVALID)
1578 return -EINVAL;
1579
1580 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1581 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1582
1583 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1584 if (r < 0)
1585 return r;
1586
1587 r = seccomp_rule_add_exact(
1588 seccomp,
1589 SCMP_ACT_ERRNO(EPERM),
1590 SCMP_SYS(personality),
1591 1,
1592 SCMP_A0(SCMP_CMP_NE, personality));
1593 if (r < 0) {
1594 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1595 continue;
1596 }
1597
1598 r = seccomp_load(seccomp);
1599 if (IN_SET(r, -EPERM, -EACCES))
1600 return r;
1601 if (r < 0)
1602 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1603 }
1604
1605 return 0;
1606 }