]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Update mailmap and contributor list (#7006)
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <linux/seccomp.h>
22 #include <seccomp.h>
23 #include <stddef.h>
24 #include <sys/mman.h>
25 #include <sys/prctl.h>
26 #include <sys/shm.h>
27
28 #include "af-list.h"
29 #include "alloc-util.h"
30 #include "macro.h"
31 #include "nsflags.h"
32 #include "process-util.h"
33 #include "seccomp-util.h"
34 #include "set.h"
35 #include "string-util.h"
36 #include "strv.h"
37 #include "util.h"
38 #include "errno-list.h"
39
40 const uint32_t seccomp_local_archs[] = {
41
42 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
43
44 #if defined(__x86_64__) && defined(__ILP32__)
45 SCMP_ARCH_X86,
46 SCMP_ARCH_X86_64,
47 SCMP_ARCH_X32, /* native */
48 #elif defined(__x86_64__) && !defined(__ILP32__)
49 SCMP_ARCH_X86,
50 SCMP_ARCH_X32,
51 SCMP_ARCH_X86_64, /* native */
52 #elif defined(__i386__)
53 SCMP_ARCH_X86,
54 #elif defined(__aarch64__)
55 SCMP_ARCH_ARM,
56 SCMP_ARCH_AARCH64, /* native */
57 #elif defined(__arm__)
58 SCMP_ARCH_ARM,
59 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
63 SCMP_ARCH_MIPS,
64 SCMP_ARCH_MIPSEL, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64N32,
69 SCMP_ARCH_MIPS64N32,
70 SCMP_ARCH_MIPSEL64,
71 SCMP_ARCH_MIPS64, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64N32,
76 SCMP_ARCH_MIPSEL64N32,
77 SCMP_ARCH_MIPS64,
78 SCMP_ARCH_MIPSEL64, /* native */
79 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPSEL,
81 SCMP_ARCH_MIPS,
82 SCMP_ARCH_MIPSEL64,
83 SCMP_ARCH_MIPS64,
84 SCMP_ARCH_MIPSEL64N32,
85 SCMP_ARCH_MIPS64N32, /* native */
86 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
87 SCMP_ARCH_MIPS,
88 SCMP_ARCH_MIPSEL,
89 SCMP_ARCH_MIPS64,
90 SCMP_ARCH_MIPSEL64,
91 SCMP_ARCH_MIPS64N32,
92 SCMP_ARCH_MIPSEL64N32, /* native */
93 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
94 SCMP_ARCH_PPC,
95 SCMP_ARCH_PPC64LE,
96 SCMP_ARCH_PPC64, /* native */
97 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
98 SCMP_ARCH_PPC,
99 SCMP_ARCH_PPC64,
100 SCMP_ARCH_PPC64LE, /* native */
101 #elif defined(__powerpc__)
102 SCMP_ARCH_PPC,
103 #elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106 #elif defined(__s390__)
107 SCMP_ARCH_S390,
108 #endif
109 (uint32_t) -1
110 };
111
112 const char* seccomp_arch_to_string(uint32_t c) {
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
117
118 switch(c) {
119 case SCMP_ARCH_NATIVE:
120 return "native";
121 case SCMP_ARCH_X86:
122 return "x86";
123 case SCMP_ARCH_X86_64:
124 return "x86-64";
125 case SCMP_ARCH_X32:
126 return "x32";
127 case SCMP_ARCH_ARM:
128 return "arm";
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 case SCMP_ARCH_MIPS:
132 return "mips";
133 case SCMP_ARCH_MIPS64:
134 return "mips64";
135 case SCMP_ARCH_MIPS64N32:
136 return "mips64-n32";
137 case SCMP_ARCH_MIPSEL:
138 return "mips-le";
139 case SCMP_ARCH_MIPSEL64:
140 return "mips64-le";
141 case SCMP_ARCH_MIPSEL64N32:
142 return "mips64-le-n32";
143 case SCMP_ARCH_PPC:
144 return "ppc";
145 case SCMP_ARCH_PPC64:
146 return "ppc64";
147 case SCMP_ARCH_PPC64LE:
148 return "ppc64-le";
149 case SCMP_ARCH_S390:
150 return "s390";
151 case SCMP_ARCH_S390X:
152 return "s390x";
153 default:
154 return NULL;
155 }
156 }
157
158 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
159 if (!n)
160 return -EINVAL;
161
162 assert(ret);
163
164 if (streq(n, "native"))
165 *ret = SCMP_ARCH_NATIVE;
166 else if (streq(n, "x86"))
167 *ret = SCMP_ARCH_X86;
168 else if (streq(n, "x86-64"))
169 *ret = SCMP_ARCH_X86_64;
170 else if (streq(n, "x32"))
171 *ret = SCMP_ARCH_X32;
172 else if (streq(n, "arm"))
173 *ret = SCMP_ARCH_ARM;
174 else if (streq(n, "arm64"))
175 *ret = SCMP_ARCH_AARCH64;
176 else if (streq(n, "mips"))
177 *ret = SCMP_ARCH_MIPS;
178 else if (streq(n, "mips64"))
179 *ret = SCMP_ARCH_MIPS64;
180 else if (streq(n, "mips64-n32"))
181 *ret = SCMP_ARCH_MIPS64N32;
182 else if (streq(n, "mips-le"))
183 *ret = SCMP_ARCH_MIPSEL;
184 else if (streq(n, "mips64-le"))
185 *ret = SCMP_ARCH_MIPSEL64;
186 else if (streq(n, "mips64-le-n32"))
187 *ret = SCMP_ARCH_MIPSEL64N32;
188 else if (streq(n, "ppc"))
189 *ret = SCMP_ARCH_PPC;
190 else if (streq(n, "ppc64"))
191 *ret = SCMP_ARCH_PPC64;
192 else if (streq(n, "ppc64-le"))
193 *ret = SCMP_ARCH_PPC64LE;
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
198 else
199 return -EINVAL;
200
201 return 0;
202 }
203
204 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
205 scmp_filter_ctx seccomp;
206 int r;
207
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
219 if (r < 0)
220 goto finish;
221
222 r = seccomp_arch_add(seccomp, arch);
223 if (r < 0)
224 goto finish;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
235 if (r < 0)
236 goto finish;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 goto finish;
241
242 *ret = seccomp;
243 return 0;
244
245 finish:
246 seccomp_release(seccomp);
247 return r;
248 }
249
/* Returns true if the PR_GET_SECCOMP prctl() succeeds, i.e. does not return a negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
253
/* Probes for seccomp filter mode by trying to install a NULL filter program: the feature is considered
 * available only if that attempt fails and the failure is specifically EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns whether both basic seccomp and seccomp filter mode are available. The probes are run on the
 * first call only; the result is kept in a static variable and reused afterwards. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
269
270 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
271 [SYSCALL_FILTER_SET_DEFAULT] = {
272 .name = "@default",
273 .help = "System calls that are always permitted",
274 .value =
275 "clock_getres\0"
276 "clock_gettime\0"
277 "clock_nanosleep\0"
278 "execve\0"
279 "exit\0"
280 "exit_group\0"
281 "futex\0"
282 "get_robust_list\0"
283 "get_thread_area\0"
284 "getegid\0"
285 "getegid32\0"
286 "geteuid\0"
287 "geteuid32\0"
288 "getgid\0"
289 "getgid32\0"
290 "getgroups\0"
291 "getgroups32\0"
292 "getpgid\0"
293 "getpgrp\0"
294 "getpid\0"
295 "getppid\0"
296 "getresgid\0"
297 "getresgid32\0"
298 "getresuid\0"
299 "getresuid32\0"
300 "getrlimit\0" /* make sure processes can query stack size and such */
301 "getsid\0"
302 "gettid\0"
303 "gettimeofday\0"
304 "getuid\0"
305 "getuid32\0"
306 "membarrier\0"
307 "nanosleep\0"
308 "pause\0"
309 "prlimit64\0"
310 "restart_syscall\0"
311 "rt_sigreturn\0"
312 "sched_yield\0"
313 "set_robust_list\0"
314 "set_thread_area\0"
315 "set_tid_address\0"
316 "sigreturn\0"
317 "time\0"
318 "ugetrlimit\0"
319 },
320 [SYSCALL_FILTER_SET_BASIC_IO] = {
321 .name = "@basic-io",
322 .help = "Basic IO",
323 .value =
324 "_llseek\0"
325 "close\0"
326 "dup\0"
327 "dup2\0"
328 "dup3\0"
329 "lseek\0"
330 "pread64\0"
331 "preadv\0"
332 "pwrite64\0"
333 "pwritev\0"
334 "read\0"
335 "readv\0"
336 "write\0"
337 "writev\0"
338 },
339 [SYSCALL_FILTER_SET_CLOCK] = {
340 .name = "@clock",
341 .help = "Change the system time",
342 .value =
343 "adjtimex\0"
344 "clock_adjtime\0"
345 "clock_settime\0"
346 "settimeofday\0"
347 "stime\0"
348 },
349 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
350 .name = "@cpu-emulation",
351 .help = "System calls for CPU emulation functionality",
352 .value =
353 "modify_ldt\0"
354 "subpage_prot\0"
355 "switch_endian\0"
356 "vm86\0"
357 "vm86old\0"
358 },
359 [SYSCALL_FILTER_SET_DEBUG] = {
360 .name = "@debug",
361 .help = "Debugging, performance monitoring and tracing functionality",
362 .value =
363 "lookup_dcookie\0"
364 "perf_event_open\0"
365 "process_vm_readv\0"
366 "process_vm_writev\0"
367 "ptrace\0"
368 "rtas\0"
369 #ifdef __NR_s390_runtime_instr
370 "s390_runtime_instr\0"
371 #endif
372 "sys_debug_setcontext\0"
373 },
374 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
375 .name = "@file-system",
376 .help = "File system operations",
377 .value =
378 "access\0"
379 "chdir\0"
380 "chmod\0"
381 "close\0"
382 "creat\0"
383 "faccessat\0"
384 "fallocate\0"
385 "fchdir\0"
386 "fchmod\0"
387 "fchmodat\0"
388 "fcntl\0"
389 "fcntl64\0"
390 "fgetxattr\0"
391 "flistxattr\0"
392 "fremovexattr\0"
393 "fsetxattr\0"
394 "fstat\0"
395 "fstat64\0"
396 "fstatat64\0"
397 "fstatfs\0"
398 "fstatfs64\0"
399 "ftruncate\0"
400 "ftruncate64\0"
401 "futimesat\0"
402 "getcwd\0"
403 "getdents\0"
404 "getdents64\0"
405 "getxattr\0"
406 "inotify_add_watch\0"
407 "inotify_init\0"
408 "inotify_init1\0"
409 "inotify_rm_watch\0"
410 "lgetxattr\0"
411 "link\0"
412 "linkat\0"
413 "listxattr\0"
414 "llistxattr\0"
415 "lremovexattr\0"
416 "lsetxattr\0"
417 "lstat\0"
418 "lstat64\0"
419 "mkdir\0"
420 "mkdirat\0"
421 "mknod\0"
422 "mknodat\0"
423 "mmap\0"
424 "mmap2\0"
425 "munmap\0"
426 "newfstatat\0"
427 "oldfstat\0"
428 "oldlstat\0"
429 "oldstat\0"
430 "open\0"
431 "openat\0"
432 "readlink\0"
433 "readlinkat\0"
434 "removexattr\0"
435 "rename\0"
436 "renameat\0"
437 "renameat2\0"
438 "rmdir\0"
439 "setxattr\0"
440 "stat\0"
441 "stat64\0"
442 "statfs\0"
443 "statfs64\0"
444 #ifdef __PNR_statx
445 "statx\0"
446 #endif
447 "symlink\0"
448 "symlinkat\0"
449 "truncate\0"
450 "truncate64\0"
451 "unlink\0"
452 "unlinkat\0"
453 "utime\0"
454 "utimensat\0"
455 "utimes\0"
456 },
457 [SYSCALL_FILTER_SET_IO_EVENT] = {
458 .name = "@io-event",
459 .help = "Event loop system calls",
460 .value =
461 "_newselect\0"
462 "epoll_create\0"
463 "epoll_create1\0"
464 "epoll_ctl\0"
465 "epoll_ctl_old\0"
466 "epoll_pwait\0"
467 "epoll_wait\0"
468 "epoll_wait_old\0"
469 "eventfd\0"
470 "eventfd2\0"
471 "poll\0"
472 "ppoll\0"
473 "pselect6\0"
474 "select\0"
475 },
476 [SYSCALL_FILTER_SET_IPC] = {
477 .name = "@ipc",
478 .help = "SysV IPC, POSIX Message Queues or other IPC",
479 .value =
480 "ipc\0"
481 "memfd_create\0"
482 "mq_getsetattr\0"
483 "mq_notify\0"
484 "mq_open\0"
485 "mq_timedreceive\0"
486 "mq_timedsend\0"
487 "mq_unlink\0"
488 "msgctl\0"
489 "msgget\0"
490 "msgrcv\0"
491 "msgsnd\0"
492 "pipe\0"
493 "pipe2\0"
494 "process_vm_readv\0"
495 "process_vm_writev\0"
496 "semctl\0"
497 "semget\0"
498 "semop\0"
499 "semtimedop\0"
500 "shmat\0"
501 "shmctl\0"
502 "shmdt\0"
503 "shmget\0"
504 },
505 [SYSCALL_FILTER_SET_KEYRING] = {
506 .name = "@keyring",
507 .help = "Kernel keyring access",
508 .value =
509 "add_key\0"
510 "keyctl\0"
511 "request_key\0"
512 },
513 [SYSCALL_FILTER_SET_MEMLOCK] = {
514 .name = "@memlock",
515 .help = "Memory locking control",
516 .value =
517 "mlock\0"
518 "mlock2\0"
519 "mlockall\0"
520 "munlock\0"
521 "munlockall\0"
522 },
523 [SYSCALL_FILTER_SET_MODULE] = {
524 .name = "@module",
525 .help = "Loading and unloading of kernel modules",
526 .value =
527 "delete_module\0"
528 "finit_module\0"
529 "init_module\0"
530 },
531 [SYSCALL_FILTER_SET_MOUNT] = {
532 .name = "@mount",
533 .help = "Mounting and unmounting of file systems",
534 .value =
535 "chroot\0"
536 "mount\0"
537 "pivot_root\0"
538 "umount\0"
539 "umount2\0"
540 },
541 [SYSCALL_FILTER_SET_NETWORK_IO] = {
542 .name = "@network-io",
543 .help = "Network or Unix socket IO, should not be needed if not network facing",
544 .value =
545 "accept\0"
546 "accept4\0"
547 "bind\0"
548 "connect\0"
549 "getpeername\0"
550 "getsockname\0"
551 "getsockopt\0"
552 "listen\0"
553 "recv\0"
554 "recvfrom\0"
555 "recvmmsg\0"
556 "recvmsg\0"
557 "send\0"
558 "sendmmsg\0"
559 "sendmsg\0"
560 "sendto\0"
561 "setsockopt\0"
562 "shutdown\0"
563 "socket\0"
564 "socketcall\0"
565 "socketpair\0"
566 },
567 [SYSCALL_FILTER_SET_OBSOLETE] = {
568 /* some unknown even to libseccomp */
569 .name = "@obsolete",
570 .help = "Unusual, obsolete or unimplemented system calls",
571 .value =
572 "_sysctl\0"
573 "afs_syscall\0"
574 "bdflush\0"
575 "break\0"
576 "create_module\0"
577 "ftime\0"
578 "get_kernel_syms\0"
579 "getpmsg\0"
580 "gtty\0"
581 "idle\0"
582 "lock\0"
583 "mpx\0"
584 "prof\0"
585 "profil\0"
586 "putpmsg\0"
587 "query_module\0"
588 "security\0"
589 "sgetmask\0"
590 "ssetmask\0"
591 "stty\0"
592 "sysfs\0"
593 "tuxcall\0"
594 "ulimit\0"
595 "uselib\0"
596 "ustat\0"
597 "vserver\0"
598 },
599 [SYSCALL_FILTER_SET_PRIVILEGED] = {
600 .name = "@privileged",
601 .help = "All system calls which need super-user capabilities",
602 .value =
603 "@clock\0"
604 "@module\0"
605 "@raw-io\0"
606 "_sysctl\0"
607 "acct\0"
608 "bpf\0"
609 "capset\0"
610 "chown\0"
611 "chown32\0"
612 "chroot\0"
613 "fchown\0"
614 "fchown32\0"
615 "fchownat\0"
616 "kexec_file_load\0"
617 "kexec_load\0"
618 "lchown\0"
619 "lchown32\0"
620 "nfsservctl\0"
621 "pivot_root\0"
622 "quotactl\0"
623 "reboot\0"
624 "setdomainname\0"
625 "setfsuid\0"
626 "setfsuid32\0"
627 "setgroups\0"
628 "setgroups32\0"
629 "sethostname\0"
630 "setresuid\0"
631 "setresuid32\0"
632 "setreuid\0"
633 "setreuid32\0"
634 "setuid\0"
635 "setuid32\0"
636 "swapoff\0"
637 "swapon\0"
638 "vhangup\0"
639 },
640 [SYSCALL_FILTER_SET_PROCESS] = {
641 .name = "@process",
642 .help = "Process control, execution, namespaceing operations",
643 .value =
644 "arch_prctl\0"
645 "capget\0" /* Able to query arbitrary processes */
646 "clone\0"
647 "execveat\0"
648 "fork\0"
649 "getrusage\0"
650 "kill\0"
651 "prctl\0"
652 "rt_sigqueueinfo\0"
653 "rt_tgsigqueueinfo\0"
654 "setns\0"
655 "tgkill\0"
656 "times\0"
657 "tkill\0"
658 "unshare\0"
659 "vfork\0"
660 "wait4\0"
661 "waitid\0"
662 "waitpid\0"
663 },
664 [SYSCALL_FILTER_SET_RAW_IO] = {
665 .name = "@raw-io",
666 .help = "Raw I/O port access",
667 .value =
668 "ioperm\0"
669 "iopl\0"
670 "pciconfig_iobase\0"
671 "pciconfig_read\0"
672 "pciconfig_write\0"
673 #ifdef __NR_s390_pci_mmio_read
674 "s390_pci_mmio_read\0"
675 #endif
676 #ifdef __NR_s390_pci_mmio_write
677 "s390_pci_mmio_write\0"
678 #endif
679 },
680 [SYSCALL_FILTER_SET_REBOOT] = {
681 .name = "@reboot",
682 .help = "Reboot and reboot preparation/kexec",
683 .value =
684 "kexec\0"
685 "kexec_file_load\0"
686 "reboot\0"
687 },
688 [SYSCALL_FILTER_SET_RESOURCES] = {
689 .name = "@resources",
690 .help = "Alter resource settings",
691 .value =
692 "ioprio_set\0"
693 "mbind\0"
694 "migrate_pages\0"
695 "move_pages\0"
696 "nice\0"
697 "sched_setaffinity\0"
698 "sched_setattr\0"
699 "sched_setparam\0"
700 "sched_setscheduler\0"
701 "set_mempolicy\0"
702 "setpriority\0"
703 "setrlimit\0"
704 },
705 [SYSCALL_FILTER_SET_SETUID] = {
706 .name = "@setuid",
707 .help = "Operations for changing user/group credentials",
708 .value =
709 "setgid\0"
710 "setgid32\0"
711 "setgroups\0"
712 "setgroups32\0"
713 "setregid\0"
714 "setregid32\0"
715 "setresgid\0"
716 "setresgid32\0"
717 "setresuid\0"
718 "setresuid32\0"
719 "setreuid\0"
720 "setreuid32\0"
721 "setuid\0"
722 "setuid32\0"
723 },
724 [SYSCALL_FILTER_SET_SIGNAL] = {
725 .name = "@signal",
726 .help = "Process signal handling",
727 .value =
728 "rt_sigaction\0"
729 "rt_sigpending\0"
730 "rt_sigprocmask\0"
731 "rt_sigsuspend\0"
732 "rt_sigtimedwait\0"
733 "sigaction\0"
734 "sigaltstack\0"
735 "signal\0"
736 "signalfd\0"
737 "signalfd4\0"
738 "sigpending\0"
739 "sigprocmask\0"
740 "sigsuspend\0"
741 },
742 [SYSCALL_FILTER_SET_SWAP] = {
743 .name = "@swap",
744 .help = "Enable/disable swap devices",
745 .value =
746 "swapoff\0"
747 "swapon\0"
748 },
749 [SYSCALL_FILTER_SET_TIMER] = {
750 .name = "@timer",
751 .help = "Schedule operations by time",
752 .value =
753 "alarm\0"
754 "getitimer\0"
755 "setitimer\0"
756 "timer_create\0"
757 "timer_delete\0"
758 "timer_getoverrun\0"
759 "timer_gettime\0"
760 "timer_settime\0"
761 "timerfd_create\0"
762 "timerfd_gettime\0"
763 "timerfd_settime\0"
764 "times\0"
765 },
766 };
767
768 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
769 unsigned i;
770
771 if (isempty(name) || name[0] != '@')
772 return NULL;
773
774 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
775 if (streq(syscall_filter_sets[i].name, name))
776 return syscall_filter_sets + i;
777
778 return NULL;
779 }
780
781 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
782
783 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
784 int r;
785
786 assert(seccomp);
787 assert(name);
788
789 if (strv_contains(exclude, name))
790 return 0;
791
792 if (name[0] == '@') {
793 const SyscallFilterSet *other;
794
795 other = syscall_filter_set_find(name);
796 if (!other) {
797 log_debug("Filter set %s is not known!", name);
798 return -EINVAL;
799 }
800
801 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
802 if (r < 0)
803 return r;
804 } else {
805 int id;
806
807 id = seccomp_syscall_resolve_name(name);
808 if (id == __NR_SCMP_ERROR) {
809 log_debug("System call %s is not known, ignoring.", name);
810 return 0;
811 }
812
813 r = seccomp_rule_add_exact(seccomp, action, id, 0);
814 if (r < 0)
815 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
816 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
817 }
818
819 return 0;
820 }
821
822 static int seccomp_add_syscall_filter_set(
823 scmp_filter_ctx seccomp,
824 const SyscallFilterSet *set,
825 uint32_t action,
826 char **exclude) {
827
828 const char *sys;
829 int r;
830
831 assert(seccomp);
832 assert(set);
833
834 NULSTR_FOREACH(sys, set->value) {
835 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
836 if (r < 0)
837 return r;
838 }
839
840 return 0;
841 }
842
843 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
844 uint32_t arch;
845 int r;
846
847 assert(set);
848
849 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
850 * earch local arch. */
851
852 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
853 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
854
855 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
856
857 r = seccomp_init_for_arch(&seccomp, arch, default_action);
858 if (r < 0)
859 return r;
860
861 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
862 if (r < 0) {
863 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
864 continue;
865 }
866
867 r = seccomp_load(seccomp);
868 if (IN_SET(r, -EPERM, -EACCES))
869 return r;
870 if (r < 0)
871 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
872 }
873
874 return 0;
875 }
876
877 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
878 uint32_t arch;
879 int r;
880
881 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
882 * SyscallFilterSet* table. */
883
884 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
885 return 0;
886
887 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
888 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
889 Iterator i;
890 void *id;
891
892 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
893
894 r = seccomp_init_for_arch(&seccomp, arch, default_action);
895 if (r < 0)
896 return r;
897
898 SET_FOREACH(id, set, i) {
899 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
900 if (r < 0) {
901 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
902 _cleanup_free_ char *n = NULL;
903
904 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
905 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
906 }
907 }
908
909 r = seccomp_load(seccomp);
910 if (IN_SET(r, -EPERM, -EACCES))
911 return r;
912 if (r < 0)
913 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
914 }
915
916 return 0;
917 }
918
919 int seccomp_restrict_namespaces(unsigned long retain) {
920 uint32_t arch;
921 int r;
922
923 if (log_get_max_level() >= LOG_DEBUG) {
924 _cleanup_free_ char *s = NULL;
925
926 (void) namespace_flag_to_string_many(retain, &s);
927 log_debug("Restricting namespace to: %s.", strna(s));
928 }
929
930 /* NOOP? */
931 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
932 return 0;
933
934 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
935 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
936 unsigned i;
937
938 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
939
940 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
941 if (r < 0)
942 return r;
943
944 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
945 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
946 * altogether. */
947 r = seccomp_rule_add_exact(
948 seccomp,
949 SCMP_ACT_ERRNO(EPERM),
950 SCMP_SYS(setns),
951 0);
952 else
953 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
954 * special invocation with a zero flags argument, right here. */
955 r = seccomp_rule_add_exact(
956 seccomp,
957 SCMP_ACT_ERRNO(EPERM),
958 SCMP_SYS(setns),
959 1,
960 SCMP_A1(SCMP_CMP_EQ, 0));
961 if (r < 0) {
962 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
963 continue;
964 }
965
966 for (i = 0; namespace_flag_map[i].name; i++) {
967 unsigned long f;
968
969 f = namespace_flag_map[i].flag;
970 if ((retain & f) == f) {
971 log_debug("Permitting %s.", namespace_flag_map[i].name);
972 continue;
973 }
974
975 log_debug("Blocking %s.", namespace_flag_map[i].name);
976
977 r = seccomp_rule_add_exact(
978 seccomp,
979 SCMP_ACT_ERRNO(EPERM),
980 SCMP_SYS(unshare),
981 1,
982 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
983 if (r < 0) {
984 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
985 break;
986 }
987
988 /* On s390/s390x the first two parameters to clone are switched */
989 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
990 r = seccomp_rule_add_exact(
991 seccomp,
992 SCMP_ACT_ERRNO(EPERM),
993 SCMP_SYS(clone),
994 1,
995 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
996 else
997 r = seccomp_rule_add_exact(
998 seccomp,
999 SCMP_ACT_ERRNO(EPERM),
1000 SCMP_SYS(clone),
1001 1,
1002 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1003 if (r < 0) {
1004 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1005 break;
1006 }
1007
1008 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1009 r = seccomp_rule_add_exact(
1010 seccomp,
1011 SCMP_ACT_ERRNO(EPERM),
1012 SCMP_SYS(setns),
1013 1,
1014 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1015 if (r < 0) {
1016 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1017 break;
1018 }
1019 }
1020 }
1021 if (r < 0)
1022 continue;
1023
1024 r = seccomp_load(seccomp);
1025 if (IN_SET(r, -EPERM, -EACCES))
1026 return r;
1027 if (r < 0)
1028 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1029 }
1030
1031 return 0;
1032 }
1033
1034 int seccomp_protect_sysctl(void) {
1035 uint32_t arch;
1036 int r;
1037
1038 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1039 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1040
1041 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1042
1043 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1044 /* No _sysctl syscall */
1045 continue;
1046
1047 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1048 if (r < 0)
1049 return r;
1050
1051 r = seccomp_rule_add_exact(
1052 seccomp,
1053 SCMP_ACT_ERRNO(EPERM),
1054 SCMP_SYS(_sysctl),
1055 0);
1056 if (r < 0) {
1057 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 continue;
1059 }
1060
1061 r = seccomp_load(seccomp);
1062 if (IN_SET(r, -EPERM, -EACCES))
1063 return r;
1064 if (r < 0)
1065 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1066 }
1067
1068 return 0;
1069 }
1070
1071 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1072 uint32_t arch;
1073 int r;
1074
1075 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1076 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1077 bool supported;
1078 Iterator i;
1079
1080 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1081
1082 switch (arch) {
1083
1084 case SCMP_ARCH_X86_64:
1085 case SCMP_ARCH_X32:
1086 case SCMP_ARCH_ARM:
1087 case SCMP_ARCH_AARCH64:
1088 case SCMP_ARCH_PPC64:
1089 case SCMP_ARCH_PPC64LE:
1090 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1091 supported = true;
1092 break;
1093
1094 case SCMP_ARCH_S390:
1095 case SCMP_ARCH_S390X:
1096 case SCMP_ARCH_PPC:
1097 case SCMP_ARCH_X86:
1098 default:
1099 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1100 * don't know */
1101 supported = false;
1102 break;
1103 }
1104
1105 if (!supported)
1106 continue;
1107
1108 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1109 if (r < 0)
1110 return r;
1111
1112 if (whitelist) {
1113 int af, first = 0, last = 0;
1114 void *afp;
1115
1116 /* If this is a whitelist, we first block the address families that are out of range and then
1117 * everything that is not in the set. First, we find the lowest and highest address family in
1118 * the set. */
1119
1120 SET_FOREACH(afp, address_families, i) {
1121 af = PTR_TO_INT(afp);
1122
1123 if (af <= 0 || af >= af_max())
1124 continue;
1125
1126 if (first == 0 || af < first)
1127 first = af;
1128
1129 if (last == 0 || af > last)
1130 last = af;
1131 }
1132
1133 assert((first == 0) == (last == 0));
1134
1135 if (first == 0) {
1136
1137 /* No entries in the valid range, block everything */
1138 r = seccomp_rule_add_exact(
1139 seccomp,
1140 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1141 SCMP_SYS(socket),
1142 0);
1143 if (r < 0) {
1144 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1145 continue;
1146 }
1147
1148 } else {
1149
1150 /* Block everything below the first entry */
1151 r = seccomp_rule_add_exact(
1152 seccomp,
1153 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1154 SCMP_SYS(socket),
1155 1,
1156 SCMP_A0(SCMP_CMP_LT, first));
1157 if (r < 0) {
1158 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1159 continue;
1160 }
1161
1162 /* Block everything above the last entry */
1163 r = seccomp_rule_add_exact(
1164 seccomp,
1165 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1166 SCMP_SYS(socket),
1167 1,
1168 SCMP_A0(SCMP_CMP_GT, last));
1169 if (r < 0) {
1170 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1171 continue;
1172 }
1173
1174 /* Block everything between the first and last entry */
1175 for (af = 1; af < af_max(); af++) {
1176
1177 if (set_contains(address_families, INT_TO_PTR(af)))
1178 continue;
1179
1180 r = seccomp_rule_add_exact(
1181 seccomp,
1182 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1183 SCMP_SYS(socket),
1184 1,
1185 SCMP_A0(SCMP_CMP_EQ, af));
1186 if (r < 0)
1187 break;
1188 }
1189 if (r < 0) {
1190 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1191 continue;
1192 }
1193 }
1194
1195 } else {
1196 void *af;
1197
1198 /* If this is a blacklist, then generate one rule for
1199 * each address family that are then combined in OR
1200 * checks. */
1201
1202 SET_FOREACH(af, address_families, i) {
1203
1204 r = seccomp_rule_add_exact(
1205 seccomp,
1206 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1207 SCMP_SYS(socket),
1208 1,
1209 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1210 if (r < 0)
1211 break;
1212 }
1213 if (r < 0) {
1214 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1215 continue;
1216 }
1217 }
1218
1219 r = seccomp_load(seccomp);
1220 if (IN_SET(r, -EPERM, -EACCES))
1221 return r;
1222 if (r < 0)
1223 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1224 }
1225
1226 return 0;
1227 }
1228
1229 int seccomp_restrict_realtime(void) {
1230 static const int permitted_policies[] = {
1231 SCHED_OTHER,
1232 SCHED_BATCH,
1233 SCHED_IDLE,
1234 };
1235
1236 int r, max_policy = 0;
1237 uint32_t arch;
1238 unsigned i;
1239
1240 /* Determine the highest policy constant we want to allow */
1241 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1242 if (permitted_policies[i] > max_policy)
1243 max_policy = permitted_policies[i];
1244
1245 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1246 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1247 int p;
1248
1249 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1250
1251 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1252 if (r < 0)
1253 return r;
1254
1255 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1256 * whitelist. */
1257 for (p = 0; p < max_policy; p++) {
1258 bool good = false;
1259
1260 /* Check if this is in the whitelist. */
1261 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1262 if (permitted_policies[i] == p) {
1263 good = true;
1264 break;
1265 }
1266
1267 if (good)
1268 continue;
1269
1270 /* Deny this policy */
1271 r = seccomp_rule_add_exact(
1272 seccomp,
1273 SCMP_ACT_ERRNO(EPERM),
1274 SCMP_SYS(sched_setscheduler),
1275 1,
1276 SCMP_A1(SCMP_CMP_EQ, p));
1277 if (r < 0) {
1278 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1279 continue;
1280 }
1281 }
1282
1283 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1284 * unsigned here, hence no need no check for < 0 values. */
1285 r = seccomp_rule_add_exact(
1286 seccomp,
1287 SCMP_ACT_ERRNO(EPERM),
1288 SCMP_SYS(sched_setscheduler),
1289 1,
1290 SCMP_A1(SCMP_CMP_GT, max_policy));
1291 if (r < 0) {
1292 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1293 continue;
1294 }
1295
1296 r = seccomp_load(seccomp);
1297 if (IN_SET(r, -EPERM, -EACCES))
1298 return r;
1299 if (r < 0)
1300 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1301 }
1302
1303 return 0;
1304 }
1305
1306 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1307 uint32_t arch,
1308 int nr,
1309 unsigned int arg_cnt,
1310 const struct scmp_arg_cmp arg) {
1311 int r;
1312
1313 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1314 if (r < 0) {
1315 _cleanup_free_ char *n = NULL;
1316
1317 n = seccomp_syscall_resolve_num_arch(arch, nr);
1318 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1319 strna(n),
1320 seccomp_arch_to_string(arch));
1321 }
1322
1323 return r;
1324 }
1325
1326 /* For known architectures, check that syscalls are indeed defined or not. */
1327 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1328 assert_cc(SCMP_SYS(shmget) > 0);
1329 assert_cc(SCMP_SYS(shmat) > 0);
1330 assert_cc(SCMP_SYS(shmdt) > 0);
1331 #elif defined(__i386__) || defined(__powerpc64__)
1332 assert_cc(SCMP_SYS(shmget) < 0);
1333 assert_cc(SCMP_SYS(shmat) < 0);
1334 assert_cc(SCMP_SYS(shmdt) < 0);
1335 #endif
1336
1337 int seccomp_memory_deny_write_execute(void) {
1338
1339 uint32_t arch;
1340 int r;
1341
1342 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1343 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1344 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1345
1346 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1347
1348 switch (arch) {
1349
1350 case SCMP_ARCH_X86:
1351 filter_syscall = SCMP_SYS(mmap2);
1352 block_syscall = SCMP_SYS(mmap);
1353 break;
1354
1355 case SCMP_ARCH_PPC64:
1356 case SCMP_ARCH_PPC64LE:
1357 filter_syscall = SCMP_SYS(mmap);
1358
1359 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1360 * We ignore that here, which means there's still a way to get writable/executable
1361 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1362
1363 break;
1364
1365 case SCMP_ARCH_ARM:
1366 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1367 shmat_syscall = SCMP_SYS(shmat);
1368 break;
1369
1370 case SCMP_ARCH_X86_64:
1371 case SCMP_ARCH_X32:
1372 case SCMP_ARCH_AARCH64:
1373 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1374 shmat_syscall = SCMP_SYS(shmat);
1375 break;
1376
1377 /* Please add more definitions here, if you port systemd to other architectures! */
1378
1379 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1380 #warning "Consider adding the right mmap() syscall definitions here!"
1381 #endif
1382 }
1383
1384 /* Can't filter mmap() on this arch, then skip it */
1385 if (filter_syscall == 0)
1386 continue;
1387
1388 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1389 if (r < 0)
1390 return r;
1391
1392 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1393 1,
1394 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1395 if (r < 0)
1396 continue;
1397
1398 if (block_syscall != 0) {
1399 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1400 if (r < 0)
1401 continue;
1402 }
1403
1404 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1405 1,
1406 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1407 if (r < 0)
1408 continue;
1409
1410 if (shmat_syscall != 0) {
1411 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1412 1,
1413 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1414 if (r < 0)
1415 continue;
1416 }
1417
1418 r = seccomp_load(seccomp);
1419 if (IN_SET(r, -EPERM, -EACCES))
1420 return r;
1421 if (r < 0)
1422 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1423 }
1424
1425 return 0;
1426 }
1427
1428 int seccomp_restrict_archs(Set *archs) {
1429 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1430 Iterator i;
1431 void *id;
1432 int r;
1433
1434 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1435 * list. */
1436
1437 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1438 if (!seccomp)
1439 return -ENOMEM;
1440
1441 SET_FOREACH(id, archs, i) {
1442 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1443 if (r == -EEXIST)
1444 continue;
1445 if (r < 0)
1446 return r;
1447 }
1448
1449 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1450 if (r < 0)
1451 return r;
1452
1453 r = seccomp_load(seccomp);
1454 if (IN_SET(r, -EPERM, -EACCES))
1455 return r;
1456 if (r < 0)
1457 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1458
1459 return 0;
1460 }
1461
1462 int parse_syscall_archs(char **l, Set **archs) {
1463 _cleanup_set_free_ Set *_archs;
1464 char **s;
1465 int r;
1466
1467 assert(l);
1468 assert(archs);
1469
1470 r = set_ensure_allocated(&_archs, NULL);
1471 if (r < 0)
1472 return r;
1473
1474 STRV_FOREACH(s, l) {
1475 uint32_t a;
1476
1477 r = seccomp_arch_from_string(*s, &a);
1478 if (r < 0)
1479 return -EINVAL;
1480
1481 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1482 if (r < 0)
1483 return -ENOMEM;
1484 }
1485
1486 *archs = _archs;
1487 _archs = NULL;
1488
1489 return 0;
1490 }
1491
1492 int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
1493 const char *i;
1494 int r;
1495
1496 assert(set);
1497
1498 NULSTR_FOREACH(i, set->value) {
1499
1500 if (i[0] == '@') {
1501 const SyscallFilterSet *more;
1502
1503 more = syscall_filter_set_find(i);
1504 if (!more)
1505 return -ENXIO;
1506
1507 r = seccomp_filter_set_add(filter, add, more);
1508 if (r < 0)
1509 return r;
1510 } else {
1511 int id;
1512
1513 id = seccomp_syscall_resolve_name(i);
1514 if (id == __NR_SCMP_ERROR) {
1515 log_debug("Couldn't resolve system call, ignoring: %s", i);
1516 continue;
1517 }
1518
1519 if (add) {
1520 r = set_put(filter, INT_TO_PTR(id + 1));
1521 if (r < 0)
1522 return r;
1523 } else
1524 (void) set_remove(filter, INT_TO_PTR(id + 1));
1525 }
1526 }
1527
1528 return 0;
1529 }
1530
1531 int seccomp_lock_personality(unsigned long personality) {
1532 uint32_t arch;
1533 int r;
1534
1535 if (personality >= PERSONALITY_INVALID)
1536 return -EINVAL;
1537
1538 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1539 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1540
1541 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1542 if (r < 0)
1543 return r;
1544
1545 r = seccomp_rule_add_exact(
1546 seccomp,
1547 SCMP_ACT_ERRNO(EPERM),
1548 SCMP_SYS(personality),
1549 1,
1550 SCMP_A0(SCMP_CMP_NE, personality));
1551 if (r < 0) {
1552 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1553 continue;
1554 }
1555
1556 r = seccomp_load(seccomp);
1557 if (IN_SET(r, -EPERM, -EACCES))
1558 return r;
1559 if (r < 0)
1560 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1561 }
1562
1563 return 0;
1564 }