]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #12635 from yuwata/nlmon-support
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "errno-list.h"
16 #include "macro.h"
17 #include "nsflags.h"
18 #include "nulstr-util.h"
19 #include "process-util.h"
20 #include "seccomp-util.h"
21 #include "set.h"
22 #include "string-util.h"
23 #include "strv.h"
24
/* Table of the seccomp architecture identifiers relevant for the target this file is compiled for. Exactly
 * one preprocessor branch contributes entries (or none at all, on targets not listed here), and the array is
 * always terminated by a (uint32_t) -1 sentinel.
 *
 * NOTE(review): presumably this is the table walked by SECCOMP_FOREACH_LOCAL_ARCH() — confirm against
 * seccomp-util.h. */
const uint32_t seccomp_local_archs[] = {

        /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */

#if defined(__x86_64__) && defined(__ILP32__)
                /* x32 ABI build */
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,         /* native */
#elif defined(__x86_64__) && !defined(__ILP32__)
                /* regular 64-bit x86 build */
                SCMP_ARCH_X86,
                SCMP_ARCH_X32,
                SCMP_ARCH_X86_64,      /* native */
#elif defined(__i386__)
                SCMP_ARCH_X86,
#elif defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,     /* native */
#elif defined(__arm__)
                SCMP_ARCH_ARM,
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                /* MIPS: one branch per combination of byte order and ABI (ABI32, ABI64, NABI32) */
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,        /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,    /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,   /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32, /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64LE,
                SCMP_ARCH_PPC64,       /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,     /* native */
#elif defined(__powerpc__)
                SCMP_ARCH_PPC,
#elif defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,       /* native */
#elif defined(__s390__)
                SCMP_ARCH_S390,
#endif
                /* End-of-table sentinel; this is the only entry on targets not matched above */
                (uint32_t) -1
        };
96
97 const char* seccomp_arch_to_string(uint32_t c) {
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
102
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
105 return "native";
106 case SCMP_ARCH_X86:
107 return "x86";
108 case SCMP_ARCH_X86_64:
109 return "x86-64";
110 case SCMP_ARCH_X32:
111 return "x32";
112 case SCMP_ARCH_ARM:
113 return "arm";
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
135 return "s390";
136 case SCMP_ARCH_S390X:
137 return "s390x";
138 default:
139 return NULL;
140 }
141 }
142
143 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
183 else
184 return -EINVAL;
185
186 return 0;
187 }
188
189 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
190 scmp_filter_ctx seccomp;
191 int r;
192
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
204 if (r < 0)
205 goto finish;
206
207 r = seccomp_arch_add(seccomp, arch);
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230 finish:
231 seccomp_release(seccomp);
232 return r;
233 }
234
/* Reports whether the PR_GET_SECCOMP prctl() succeeds, i.e. whether the current seccomp mode can be queried
 * at all. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
238
/* Probes for seccomp filter mode by asking the kernel to install a filter from a NULL pointer. The call is
 * expected to fail; only a failure with EFAULT is taken to mean that filter mode is available.
 *
 * NOTE(review): presumably the kernel only gets as far as looking at the pointer when filter mode is
 * supported — see prctl(2) for the EFAULT/EINVAL distinction. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Returns true if both the basic seccomp prctl() and seccomp filter mode are usable. The probes are run on
 * the first call only; their combined result is kept in a function-local static and returned from then on. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1 means "not probed yet" */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
254
255 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
256 [SYSCALL_FILTER_SET_DEFAULT] = {
257 .name = "@default",
258 .help = "System calls that are always permitted",
259 .value =
260 "clock_getres\0"
261 "clock_gettime\0"
262 "clock_nanosleep\0"
263 "execve\0"
264 "exit\0"
265 "exit_group\0"
266 "futex\0"
267 "get_robust_list\0"
268 "get_thread_area\0"
269 "getegid\0"
270 "getegid32\0"
271 "geteuid\0"
272 "geteuid32\0"
273 "getgid\0"
274 "getgid32\0"
275 "getgroups\0"
276 "getgroups32\0"
277 "getpgid\0"
278 "getpgrp\0"
279 "getpid\0"
280 "getppid\0"
281 "getresgid\0"
282 "getresgid32\0"
283 "getresuid\0"
284 "getresuid32\0"
285 "getrlimit\0" /* make sure processes can query stack size and such */
286 "getsid\0"
287 "gettid\0"
288 "gettimeofday\0"
289 "getuid\0"
290 "getuid32\0"
291 "membarrier\0"
292 "nanosleep\0"
293 "pause\0"
294 "prlimit64\0"
295 "restart_syscall\0"
296 "rseq\0"
297 "rt_sigreturn\0"
298 "sched_yield\0"
299 "set_robust_list\0"
300 "set_thread_area\0"
301 "set_tid_address\0"
302 "set_tls\0"
303 "sigreturn\0"
304 "time\0"
305 "ugetrlimit\0"
306 },
307 [SYSCALL_FILTER_SET_AIO] = {
308 .name = "@aio",
309 .help = "Asynchronous IO",
310 .value =
311 "io_cancel\0"
312 "io_destroy\0"
313 "io_getevents\0"
314 "io_pgetevents\0"
315 "io_setup\0"
316 "io_submit\0"
317 },
318 [SYSCALL_FILTER_SET_BASIC_IO] = {
319 .name = "@basic-io",
320 .help = "Basic IO",
321 .value =
322 "_llseek\0"
323 "close\0"
324 "dup\0"
325 "dup2\0"
326 "dup3\0"
327 "lseek\0"
328 "pread64\0"
329 "preadv\0"
330 "preadv2\0"
331 "pwrite64\0"
332 "pwritev\0"
333 "pwritev2\0"
334 "read\0"
335 "readv\0"
336 "write\0"
337 "writev\0"
338 },
339 [SYSCALL_FILTER_SET_CHOWN] = {
340 .name = "@chown",
341 .help = "Change ownership of files and directories",
342 .value =
343 "chown\0"
344 "chown32\0"
345 "fchown\0"
346 "fchown32\0"
347 "fchownat\0"
348 "lchown\0"
349 "lchown32\0"
350 },
351 [SYSCALL_FILTER_SET_CLOCK] = {
352 .name = "@clock",
353 .help = "Change the system time",
354 .value =
355 "adjtimex\0"
356 "clock_adjtime\0"
357 "clock_settime\0"
358 "settimeofday\0"
359 "stime\0"
360 },
361 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
362 .name = "@cpu-emulation",
363 .help = "System calls for CPU emulation functionality",
364 .value =
365 "modify_ldt\0"
366 "subpage_prot\0"
367 "switch_endian\0"
368 "vm86\0"
369 "vm86old\0"
370 },
371 [SYSCALL_FILTER_SET_DEBUG] = {
372 .name = "@debug",
373 .help = "Debugging, performance monitoring and tracing functionality",
374 .value =
375 "lookup_dcookie\0"
376 "perf_event_open\0"
377 "ptrace\0"
378 "rtas\0"
379 #ifdef __NR_s390_runtime_instr
380 "s390_runtime_instr\0"
381 #endif
382 "sys_debug_setcontext\0"
383 },
384 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
385 .name = "@file-system",
386 .help = "File system operations",
387 .value =
388 "access\0"
389 "chdir\0"
390 "chmod\0"
391 "close\0"
392 "creat\0"
393 "faccessat\0"
394 "fallocate\0"
395 "fchdir\0"
396 "fchmod\0"
397 "fchmodat\0"
398 "fcntl\0"
399 "fcntl64\0"
400 "fgetxattr\0"
401 "flistxattr\0"
402 "fremovexattr\0"
403 "fsetxattr\0"
404 "fstat\0"
405 "fstat64\0"
406 "fstatat64\0"
407 "fstatfs\0"
408 "fstatfs64\0"
409 "ftruncate\0"
410 "ftruncate64\0"
411 "futimesat\0"
412 "getcwd\0"
413 "getdents\0"
414 "getdents64\0"
415 "getxattr\0"
416 "inotify_add_watch\0"
417 "inotify_init\0"
418 "inotify_init1\0"
419 "inotify_rm_watch\0"
420 "lgetxattr\0"
421 "link\0"
422 "linkat\0"
423 "listxattr\0"
424 "llistxattr\0"
425 "lremovexattr\0"
426 "lsetxattr\0"
427 "lstat\0"
428 "lstat64\0"
429 "mkdir\0"
430 "mkdirat\0"
431 "mknod\0"
432 "mknodat\0"
433 "mmap\0"
434 "mmap2\0"
435 "munmap\0"
436 "newfstatat\0"
437 "oldfstat\0"
438 "oldlstat\0"
439 "oldstat\0"
440 "open\0"
441 "openat\0"
442 "readlink\0"
443 "readlinkat\0"
444 "removexattr\0"
445 "rename\0"
446 "renameat\0"
447 "renameat2\0"
448 "rmdir\0"
449 "setxattr\0"
450 "stat\0"
451 "stat64\0"
452 "statfs\0"
453 "statfs64\0"
454 #ifdef __NR_statx
455 "statx\0"
456 #endif
457 "symlink\0"
458 "symlinkat\0"
459 "truncate\0"
460 "truncate64\0"
461 "unlink\0"
462 "unlinkat\0"
463 "utime\0"
464 "utimensat\0"
465 "utimes\0"
466 },
467 [SYSCALL_FILTER_SET_IO_EVENT] = {
468 .name = "@io-event",
469 .help = "Event loop system calls",
470 .value =
471 "_newselect\0"
472 "epoll_create\0"
473 "epoll_create1\0"
474 "epoll_ctl\0"
475 "epoll_ctl_old\0"
476 "epoll_pwait\0"
477 "epoll_wait\0"
478 "epoll_wait_old\0"
479 "eventfd\0"
480 "eventfd2\0"
481 "poll\0"
482 "ppoll\0"
483 "pselect6\0"
484 "select\0"
485 },
486 [SYSCALL_FILTER_SET_IPC] = {
487 .name = "@ipc",
488 .help = "SysV IPC, POSIX Message Queues or other IPC",
489 .value =
490 "ipc\0"
491 "memfd_create\0"
492 "mq_getsetattr\0"
493 "mq_notify\0"
494 "mq_open\0"
495 "mq_timedreceive\0"
496 "mq_timedsend\0"
497 "mq_unlink\0"
498 "msgctl\0"
499 "msgget\0"
500 "msgrcv\0"
501 "msgsnd\0"
502 "pipe\0"
503 "pipe2\0"
504 "process_vm_readv\0"
505 "process_vm_writev\0"
506 "semctl\0"
507 "semget\0"
508 "semop\0"
509 "semtimedop\0"
510 "shmat\0"
511 "shmctl\0"
512 "shmdt\0"
513 "shmget\0"
514 },
515 [SYSCALL_FILTER_SET_KEYRING] = {
516 .name = "@keyring",
517 .help = "Kernel keyring access",
518 .value =
519 "add_key\0"
520 "keyctl\0"
521 "request_key\0"
522 },
523 [SYSCALL_FILTER_SET_MEMLOCK] = {
524 .name = "@memlock",
525 .help = "Memory locking control",
526 .value =
527 "mlock\0"
528 "mlock2\0"
529 "mlockall\0"
530 "munlock\0"
531 "munlockall\0"
532 },
533 [SYSCALL_FILTER_SET_MODULE] = {
534 .name = "@module",
535 .help = "Loading and unloading of kernel modules",
536 .value =
537 "delete_module\0"
538 "finit_module\0"
539 "init_module\0"
540 },
541 [SYSCALL_FILTER_SET_MOUNT] = {
542 .name = "@mount",
543 .help = "Mounting and unmounting of file systems",
544 .value =
545 "chroot\0"
546 "mount\0"
547 "pivot_root\0"
548 "umount\0"
549 "umount2\0"
550 },
551 [SYSCALL_FILTER_SET_NETWORK_IO] = {
552 .name = "@network-io",
553 .help = "Network or Unix socket IO, should not be needed if not network facing",
554 .value =
555 "accept\0"
556 "accept4\0"
557 "bind\0"
558 "connect\0"
559 "getpeername\0"
560 "getsockname\0"
561 "getsockopt\0"
562 "listen\0"
563 "recv\0"
564 "recvfrom\0"
565 "recvmmsg\0"
566 "recvmsg\0"
567 "send\0"
568 "sendmmsg\0"
569 "sendmsg\0"
570 "sendto\0"
571 "setsockopt\0"
572 "shutdown\0"
573 "socket\0"
574 "socketcall\0"
575 "socketpair\0"
576 },
577 [SYSCALL_FILTER_SET_OBSOLETE] = {
578 /* some unknown even to libseccomp */
579 .name = "@obsolete",
580 .help = "Unusual, obsolete or unimplemented system calls",
581 .value =
582 "_sysctl\0"
583 "afs_syscall\0"
584 "bdflush\0"
585 "break\0"
586 "create_module\0"
587 "ftime\0"
588 "get_kernel_syms\0"
589 "getpmsg\0"
590 "gtty\0"
591 "idle\0"
592 "lock\0"
593 "mpx\0"
594 "prof\0"
595 "profil\0"
596 "putpmsg\0"
597 "query_module\0"
598 "security\0"
599 "sgetmask\0"
600 "ssetmask\0"
601 "stty\0"
602 "sysfs\0"
603 "tuxcall\0"
604 "ulimit\0"
605 "uselib\0"
606 "ustat\0"
607 "vserver\0"
608 },
609 [SYSCALL_FILTER_SET_PRIVILEGED] = {
610 .name = "@privileged",
611 .help = "All system calls which need super-user capabilities",
612 .value =
613 "@chown\0"
614 "@clock\0"
615 "@module\0"
616 "@raw-io\0"
617 "@reboot\0"
618 "@swap\0"
619 "_sysctl\0"
620 "acct\0"
621 "bpf\0"
622 "capset\0"
623 "chroot\0"
624 "fanotify_init\0"
625 "nfsservctl\0"
626 "open_by_handle_at\0"
627 "pivot_root\0"
628 "quotactl\0"
629 "setdomainname\0"
630 "setfsuid\0"
631 "setfsuid32\0"
632 "setgroups\0"
633 "setgroups32\0"
634 "sethostname\0"
635 "setresuid\0"
636 "setresuid32\0"
637 "setreuid\0"
638 "setreuid32\0"
639 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
640 "setuid32\0"
641 "vhangup\0"
642 },
643 [SYSCALL_FILTER_SET_PROCESS] = {
644 .name = "@process",
645 .help = "Process control, execution, namespaceing operations",
646 .value =
647 "arch_prctl\0"
648 "capget\0" /* Able to query arbitrary processes */
649 "clone\0"
650 "execveat\0"
651 "fork\0"
652 "getrusage\0"
653 "kill\0"
654 "pidfd_send_signal\0"
655 "prctl\0"
656 "rt_sigqueueinfo\0"
657 "rt_tgsigqueueinfo\0"
658 "setns\0"
659 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
660 "tgkill\0"
661 "times\0"
662 "tkill\0"
663 "unshare\0"
664 "vfork\0"
665 "wait4\0"
666 "waitid\0"
667 "waitpid\0"
668 },
669 [SYSCALL_FILTER_SET_RAW_IO] = {
670 .name = "@raw-io",
671 .help = "Raw I/O port access",
672 .value =
673 "ioperm\0"
674 "iopl\0"
675 "pciconfig_iobase\0"
676 "pciconfig_read\0"
677 "pciconfig_write\0"
678 #ifdef __NR_s390_pci_mmio_read
679 "s390_pci_mmio_read\0"
680 #endif
681 #ifdef __NR_s390_pci_mmio_write
682 "s390_pci_mmio_write\0"
683 #endif
684 },
685 [SYSCALL_FILTER_SET_REBOOT] = {
686 .name = "@reboot",
687 .help = "Reboot and reboot preparation/kexec",
688 .value =
689 "kexec_file_load\0"
690 "kexec_load\0"
691 "reboot\0"
692 },
693 [SYSCALL_FILTER_SET_RESOURCES] = {
694 .name = "@resources",
695 .help = "Alter resource settings",
696 .value =
697 "ioprio_set\0"
698 "mbind\0"
699 "migrate_pages\0"
700 "move_pages\0"
701 "nice\0"
702 "sched_setaffinity\0"
703 "sched_setattr\0"
704 "sched_setparam\0"
705 "sched_setscheduler\0"
706 "set_mempolicy\0"
707 "setpriority\0"
708 "setrlimit\0"
709 },
710 [SYSCALL_FILTER_SET_SETUID] = {
711 .name = "@setuid",
712 .help = "Operations for changing user/group credentials",
713 .value =
714 "setgid\0"
715 "setgid32\0"
716 "setgroups\0"
717 "setgroups32\0"
718 "setregid\0"
719 "setregid32\0"
720 "setresgid\0"
721 "setresgid32\0"
722 "setresuid\0"
723 "setresuid32\0"
724 "setreuid\0"
725 "setreuid32\0"
726 "setuid\0"
727 "setuid32\0"
728 },
729 [SYSCALL_FILTER_SET_SIGNAL] = {
730 .name = "@signal",
731 .help = "Process signal handling",
732 .value =
733 "rt_sigaction\0"
734 "rt_sigpending\0"
735 "rt_sigprocmask\0"
736 "rt_sigsuspend\0"
737 "rt_sigtimedwait\0"
738 "sigaction\0"
739 "sigaltstack\0"
740 "signal\0"
741 "signalfd\0"
742 "signalfd4\0"
743 "sigpending\0"
744 "sigprocmask\0"
745 "sigsuspend\0"
746 },
747 [SYSCALL_FILTER_SET_SWAP] = {
748 .name = "@swap",
749 .help = "Enable/disable swap devices",
750 .value =
751 "swapoff\0"
752 "swapon\0"
753 },
754 [SYSCALL_FILTER_SET_SYNC] = {
755 .name = "@sync",
756 .help = "Synchronize files and memory to storage",
757 .value =
758 "fdatasync\0"
759 "fsync\0"
760 "msync\0"
761 "sync\0"
762 "sync_file_range\0"
763 "syncfs\0"
764 },
765 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
766 .name = "@system-service",
767 .help = "General system service operations",
768 .value =
769 "@aio\0"
770 "@basic-io\0"
771 "@chown\0"
772 "@default\0"
773 "@file-system\0"
774 "@io-event\0"
775 "@ipc\0"
776 "@keyring\0"
777 "@memlock\0"
778 "@network-io\0"
779 "@process\0"
780 "@resources\0"
781 "@setuid\0"
782 "@signal\0"
783 "@sync\0"
784 "@timer\0"
785 "brk\0"
786 "capget\0"
787 "capset\0"
788 "copy_file_range\0"
789 "fadvise64\0"
790 "fadvise64_64\0"
791 "flock\0"
792 "get_mempolicy\0"
793 "getcpu\0"
794 "getpriority\0"
795 "getrandom\0"
796 "ioctl\0"
797 "ioprio_get\0"
798 "kcmp\0"
799 "madvise\0"
800 "mprotect\0"
801 "mremap\0"
802 "name_to_handle_at\0"
803 "oldolduname\0"
804 "olduname\0"
805 "personality\0"
806 "readahead\0"
807 "readdir\0"
808 "remap_file_pages\0"
809 "sched_get_priority_max\0"
810 "sched_get_priority_min\0"
811 "sched_getaffinity\0"
812 "sched_getattr\0"
813 "sched_getparam\0"
814 "sched_getscheduler\0"
815 "sched_rr_get_interval\0"
816 "sched_yield\0"
817 "sendfile\0"
818 "sendfile64\0"
819 "setfsgid\0"
820 "setfsgid32\0"
821 "setfsuid\0"
822 "setfsuid32\0"
823 "setpgid\0"
824 "setsid\0"
825 "splice\0"
826 "sysinfo\0"
827 "tee\0"
828 "umask\0"
829 "uname\0"
830 "userfaultfd\0"
831 "vmsplice\0"
832 },
833 [SYSCALL_FILTER_SET_TIMER] = {
834 .name = "@timer",
835 .help = "Schedule operations by time",
836 .value =
837 "alarm\0"
838 "getitimer\0"
839 "setitimer\0"
840 "timer_create\0"
841 "timer_delete\0"
842 "timer_getoverrun\0"
843 "timer_gettime\0"
844 "timer_settime\0"
845 "timerfd_create\0"
846 "timerfd_gettime\0"
847 "timerfd_settime\0"
848 "times\0"
849 },
850 };
851
852 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
853 unsigned i;
854
855 if (isempty(name) || name[0] != '@')
856 return NULL;
857
858 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
859 if (streq(syscall_filter_sets[i].name, name))
860 return syscall_filter_sets + i;
861
862 return NULL;
863 }
864
865 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
866
867 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
868 assert(seccomp);
869 assert(name);
870
871 if (strv_contains(exclude, name))
872 return 0;
873
874 if (name[0] == '@') {
875 const SyscallFilterSet *other;
876
877 other = syscall_filter_set_find(name);
878 if (!other)
879 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
880 "Filter set %s is not known!",
881 name);
882
883 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
884
885 } else {
886 int id, r;
887
888 id = seccomp_syscall_resolve_name(name);
889 if (id == __NR_SCMP_ERROR) {
890 if (log_missing)
891 log_debug("System call %s is not known, ignoring.", name);
892 return 0;
893 }
894
895 r = seccomp_rule_add_exact(seccomp, action, id, 0);
896 if (r < 0) {
897 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
898 bool ignore = r == -EDOM;
899
900 if (!ignore || log_missing)
901 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
902 name, id, ignore ? ", ignoring" : "");
903 if (!ignore)
904 return r;
905 }
906
907 return 0;
908 }
909 }
910
911 static int seccomp_add_syscall_filter_set(
912 scmp_filter_ctx seccomp,
913 const SyscallFilterSet *set,
914 uint32_t action,
915 char **exclude,
916 bool log_missing) {
917
918 const char *sys;
919 int r;
920
921 assert(seccomp);
922 assert(set);
923
924 NULSTR_FOREACH(sys, set->value) {
925 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
926 if (r < 0)
927 return r;
928 }
929
930 return 0;
931 }
932
933 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
934 uint32_t arch;
935 int r;
936
937 assert(set);
938
939 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
940 * each local arch. */
941
942 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
943 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
944
945 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
946
947 r = seccomp_init_for_arch(&seccomp, arch, default_action);
948 if (r < 0)
949 return r;
950
951 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
952 if (r < 0)
953 return log_debug_errno(r, "Failed to add filter set: %m");
954
955 r = seccomp_load(seccomp);
956 if (ERRNO_IS_SECCOMP_FATAL(r))
957 return r;
958 if (r < 0)
959 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
960 }
961
962 return 0;
963 }
964
965 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
966 uint32_t arch;
967 int r;
968
969 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
970 * SyscallFilterSet* table. */
971
972 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
973 return 0;
974
975 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
976 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
977 Iterator i;
978 void *syscall_id, *val;
979
980 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
981
982 r = seccomp_init_for_arch(&seccomp, arch, default_action);
983 if (r < 0)
984 return r;
985
986 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
987 uint32_t a = action;
988 int id = PTR_TO_INT(syscall_id) - 1;
989 int error = PTR_TO_INT(val);
990
991 if (action != SCMP_ACT_ALLOW && error >= 0)
992 a = SCMP_ACT_ERRNO(error);
993
994 r = seccomp_rule_add_exact(seccomp, a, id, 0);
995 if (r < 0) {
996 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
997 _cleanup_free_ char *n = NULL;
998 bool ignore;
999
1000 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1001 ignore = r == -EDOM;
1002 if (!ignore || log_missing)
1003 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1004 strna(n), id, ignore ? ", ignoring" : "");
1005 if (!ignore)
1006 return r;
1007 }
1008 }
1009
1010 r = seccomp_load(seccomp);
1011 if (ERRNO_IS_SECCOMP_FATAL(r))
1012 return r;
1013 if (r < 0)
1014 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1015 }
1016
1017 return 0;
1018 }
1019
1020 int seccomp_parse_syscall_filter(
1021 const char *name,
1022 int errno_num,
1023 Hashmap *filter,
1024 SeccompParseFlags flags,
1025 const char *unit,
1026 const char *filename,
1027 unsigned line) {
1028
1029 int r;
1030
1031 assert(name);
1032 assert(filter);
1033
1034 if (name[0] == '@') {
1035 const SyscallFilterSet *set;
1036 const char *i;
1037
1038 set = syscall_filter_set_find(name);
1039 if (!set) {
1040 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1041 return -EINVAL;
1042
1043 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1044 "Unknown system call group, ignoring: %s", name);
1045 return 0;
1046 }
1047
1048 NULSTR_FOREACH(i, set->value) {
1049 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1050 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1051 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1052 * about them. */
1053 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1054 if (r < 0)
1055 return r;
1056 }
1057 } else {
1058 int id;
1059
1060 id = seccomp_syscall_resolve_name(name);
1061 if (id == __NR_SCMP_ERROR) {
1062 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1063 return -EINVAL;
1064
1065 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1066 "Failed to parse system call, ignoring: %s", name);
1067 return 0;
1068 }
1069
1070 /* If we previously wanted to forbid a syscall and now
1071 * we want to allow it, then remove it from the list. */
1072 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1073 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1074 if (r < 0)
1075 switch (r) {
1076 case -ENOMEM:
1077 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1078 case -EEXIST:
1079 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1080 break;
1081 default:
1082 return r;
1083 }
1084 } else
1085 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1086 }
1087
1088 return 0;
1089 }
1090
1091 int seccomp_restrict_namespaces(unsigned long retain) {
1092 uint32_t arch;
1093 int r;
1094
1095 if (DEBUG_LOGGING) {
1096 _cleanup_free_ char *s = NULL;
1097
1098 (void) namespace_flags_to_string(retain, &s);
1099 log_debug("Restricting namespace to: %s.", strna(s));
1100 }
1101
1102 /* NOOP? */
1103 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1104 return 0;
1105
1106 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1107 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1108 unsigned i;
1109
1110 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1111
1112 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1113 if (r < 0)
1114 return r;
1115
1116 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1117 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1118 * altogether. */
1119 r = seccomp_rule_add_exact(
1120 seccomp,
1121 SCMP_ACT_ERRNO(EPERM),
1122 SCMP_SYS(setns),
1123 0);
1124 else
1125 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1126 * special invocation with a zero flags argument, right here. */
1127 r = seccomp_rule_add_exact(
1128 seccomp,
1129 SCMP_ACT_ERRNO(EPERM),
1130 SCMP_SYS(setns),
1131 1,
1132 SCMP_A1(SCMP_CMP_EQ, 0));
1133 if (r < 0) {
1134 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1135 continue;
1136 }
1137
1138 for (i = 0; namespace_flag_map[i].name; i++) {
1139 unsigned long f;
1140
1141 f = namespace_flag_map[i].flag;
1142 if ((retain & f) == f) {
1143 log_debug("Permitting %s.", namespace_flag_map[i].name);
1144 continue;
1145 }
1146
1147 log_debug("Blocking %s.", namespace_flag_map[i].name);
1148
1149 r = seccomp_rule_add_exact(
1150 seccomp,
1151 SCMP_ACT_ERRNO(EPERM),
1152 SCMP_SYS(unshare),
1153 1,
1154 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1155 if (r < 0) {
1156 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1157 break;
1158 }
1159
1160 /* On s390/s390x the first two parameters to clone are switched */
1161 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1162 r = seccomp_rule_add_exact(
1163 seccomp,
1164 SCMP_ACT_ERRNO(EPERM),
1165 SCMP_SYS(clone),
1166 1,
1167 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1168 else
1169 r = seccomp_rule_add_exact(
1170 seccomp,
1171 SCMP_ACT_ERRNO(EPERM),
1172 SCMP_SYS(clone),
1173 1,
1174 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1175 if (r < 0) {
1176 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1177 break;
1178 }
1179
1180 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1181 r = seccomp_rule_add_exact(
1182 seccomp,
1183 SCMP_ACT_ERRNO(EPERM),
1184 SCMP_SYS(setns),
1185 1,
1186 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1187 if (r < 0) {
1188 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1189 break;
1190 }
1191 }
1192 }
1193 if (r < 0)
1194 continue;
1195
1196 r = seccomp_load(seccomp);
1197 if (ERRNO_IS_SECCOMP_FATAL(r))
1198 return r;
1199 if (r < 0)
1200 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1201 }
1202
1203 return 0;
1204 }
1205
1206 int seccomp_protect_sysctl(void) {
1207 uint32_t arch;
1208 int r;
1209
1210 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1211 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1212
1213 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1214
1215 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1216 /* No _sysctl syscall */
1217 continue;
1218
1219 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1220 if (r < 0)
1221 return r;
1222
1223 r = seccomp_rule_add_exact(
1224 seccomp,
1225 SCMP_ACT_ERRNO(EPERM),
1226 SCMP_SYS(_sysctl),
1227 0);
1228 if (r < 0) {
1229 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1230 continue;
1231 }
1232
1233 r = seccomp_load(seccomp);
1234 if (ERRNO_IS_SECCOMP_FATAL(r))
1235 return r;
1236 if (r < 0)
1237 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1238 }
1239
1240 return 0;
1241 }
1242
1243 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1244 uint32_t arch;
1245 int r;
1246
1247 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1248 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1249 bool supported;
1250 Iterator i;
1251
1252 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1253
1254 switch (arch) {
1255
1256 case SCMP_ARCH_X86_64:
1257 case SCMP_ARCH_X32:
1258 case SCMP_ARCH_ARM:
1259 case SCMP_ARCH_AARCH64:
1260 case SCMP_ARCH_PPC:
1261 case SCMP_ARCH_PPC64:
1262 case SCMP_ARCH_PPC64LE:
1263 case SCMP_ARCH_MIPSEL64N32:
1264 case SCMP_ARCH_MIPS64N32:
1265 case SCMP_ARCH_MIPSEL64:
1266 case SCMP_ARCH_MIPS64:
1267 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1268 supported = true;
1269 break;
1270
1271 case SCMP_ARCH_S390:
1272 case SCMP_ARCH_S390X:
1273 case SCMP_ARCH_X86:
1274 case SCMP_ARCH_MIPSEL:
1275 case SCMP_ARCH_MIPS:
1276 default:
1277 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1278 * don't know */
1279 supported = false;
1280 break;
1281 }
1282
1283 if (!supported)
1284 continue;
1285
1286 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1287 if (r < 0)
1288 return r;
1289
1290 if (whitelist) {
1291 int af, first = 0, last = 0;
1292 void *afp;
1293
1294 /* If this is a whitelist, we first block the address families that are out of range and then
1295 * everything that is not in the set. First, we find the lowest and highest address family in
1296 * the set. */
1297
1298 SET_FOREACH(afp, address_families, i) {
1299 af = PTR_TO_INT(afp);
1300
1301 if (af <= 0 || af >= af_max())
1302 continue;
1303
1304 if (first == 0 || af < first)
1305 first = af;
1306
1307 if (last == 0 || af > last)
1308 last = af;
1309 }
1310
1311 assert((first == 0) == (last == 0));
1312
1313 if (first == 0) {
1314
1315 /* No entries in the valid range, block everything */
1316 r = seccomp_rule_add_exact(
1317 seccomp,
1318 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1319 SCMP_SYS(socket),
1320 0);
1321 if (r < 0) {
1322 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1323 continue;
1324 }
1325
1326 } else {
1327
1328 /* Block everything below the first entry */
1329 r = seccomp_rule_add_exact(
1330 seccomp,
1331 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1332 SCMP_SYS(socket),
1333 1,
1334 SCMP_A0(SCMP_CMP_LT, first));
1335 if (r < 0) {
1336 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1337 continue;
1338 }
1339
1340 /* Block everything above the last entry */
1341 r = seccomp_rule_add_exact(
1342 seccomp,
1343 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1344 SCMP_SYS(socket),
1345 1,
1346 SCMP_A0(SCMP_CMP_GT, last));
1347 if (r < 0) {
1348 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1349 continue;
1350 }
1351
1352 /* Block everything between the first and last entry */
1353 for (af = 1; af < af_max(); af++) {
1354
1355 if (set_contains(address_families, INT_TO_PTR(af)))
1356 continue;
1357
1358 r = seccomp_rule_add_exact(
1359 seccomp,
1360 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1361 SCMP_SYS(socket),
1362 1,
1363 SCMP_A0(SCMP_CMP_EQ, af));
1364 if (r < 0)
1365 break;
1366 }
1367 if (r < 0) {
1368 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1369 continue;
1370 }
1371 }
1372
1373 } else {
1374 void *af;
1375
1376 /* If this is a blacklist, then generate one rule for
1377 * each address family that are then combined in OR
1378 * checks. */
1379
1380 SET_FOREACH(af, address_families, i) {
1381
1382 r = seccomp_rule_add_exact(
1383 seccomp,
1384 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1385 SCMP_SYS(socket),
1386 1,
1387 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1388 if (r < 0)
1389 break;
1390 }
1391 if (r < 0) {
1392 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1393 continue;
1394 }
1395 }
1396
1397 r = seccomp_load(seccomp);
1398 if (ERRNO_IS_SECCOMP_FATAL(r))
1399 return r;
1400 if (r < 0)
1401 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1402 }
1403
1404 return 0;
1405 }
1406
1407 int seccomp_restrict_realtime(void) {
1408 static const int permitted_policies[] = {
1409 SCHED_OTHER,
1410 SCHED_BATCH,
1411 SCHED_IDLE,
1412 };
1413
1414 int r, max_policy = 0;
1415 uint32_t arch;
1416 unsigned i;
1417
1418 /* Determine the highest policy constant we want to allow */
1419 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1420 if (permitted_policies[i] > max_policy)
1421 max_policy = permitted_policies[i];
1422
1423 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1424 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1425 int p;
1426
1427 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1428
1429 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1430 if (r < 0)
1431 return r;
1432
1433 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1434 * whitelist. */
1435 for (p = 0; p < max_policy; p++) {
1436 bool good = false;
1437
1438 /* Check if this is in the whitelist. */
1439 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1440 if (permitted_policies[i] == p) {
1441 good = true;
1442 break;
1443 }
1444
1445 if (good)
1446 continue;
1447
1448 /* Deny this policy */
1449 r = seccomp_rule_add_exact(
1450 seccomp,
1451 SCMP_ACT_ERRNO(EPERM),
1452 SCMP_SYS(sched_setscheduler),
1453 1,
1454 SCMP_A1(SCMP_CMP_EQ, p));
1455 if (r < 0) {
1456 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1457 continue;
1458 }
1459 }
1460
1461 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1462 * unsigned here, hence no need no check for < 0 values. */
1463 r = seccomp_rule_add_exact(
1464 seccomp,
1465 SCMP_ACT_ERRNO(EPERM),
1466 SCMP_SYS(sched_setscheduler),
1467 1,
1468 SCMP_A1(SCMP_CMP_GT, max_policy));
1469 if (r < 0) {
1470 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1471 continue;
1472 }
1473
1474 r = seccomp_load(seccomp);
1475 if (ERRNO_IS_SECCOMP_FATAL(r))
1476 return r;
1477 if (r < 0)
1478 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1479 }
1480
1481 return 0;
1482 }
1483
1484 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1485 uint32_t arch,
1486 int nr,
1487 unsigned arg_cnt,
1488 const struct scmp_arg_cmp arg) {
1489 int r;
1490
1491 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1492 if (r < 0) {
1493 _cleanup_free_ char *n = NULL;
1494
1495 n = seccomp_syscall_resolve_num_arch(arch, nr);
1496 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1497 strna(n),
1498 seccomp_arch_to_string(arch));
1499 }
1500
1501 return r;
1502 }
1503
1504 /* For known architectures, check that syscalls are indeed defined or not. */
1505 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1506 assert_cc(SCMP_SYS(shmget) > 0);
1507 assert_cc(SCMP_SYS(shmat) > 0);
1508 assert_cc(SCMP_SYS(shmdt) > 0);
1509 #endif
1510
1511 int seccomp_memory_deny_write_execute(void) {
1512 uint32_t arch;
1513 int r;
1514
1515 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1516 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1517 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1518
1519 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1520
1521 switch (arch) {
1522
1523 case SCMP_ARCH_X86:
1524 filter_syscall = SCMP_SYS(mmap2);
1525 block_syscall = SCMP_SYS(mmap);
1526 shmat_syscall = SCMP_SYS(shmat);
1527 break;
1528
1529 case SCMP_ARCH_PPC:
1530 case SCMP_ARCH_PPC64:
1531 case SCMP_ARCH_PPC64LE:
1532 filter_syscall = SCMP_SYS(mmap);
1533
1534 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1535 * We ignore that here, which means there's still a way to get writable/executable
1536 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1537
1538 break;
1539
1540 case SCMP_ARCH_ARM:
1541 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1542 shmat_syscall = SCMP_SYS(shmat);
1543 break;
1544
1545 case SCMP_ARCH_X86_64:
1546 case SCMP_ARCH_X32:
1547 case SCMP_ARCH_AARCH64:
1548 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1549 shmat_syscall = SCMP_SYS(shmat);
1550 break;
1551
1552 /* Please add more definitions here, if you port systemd to other architectures! */
1553
1554 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1555 #warning "Consider adding the right mmap() syscall definitions here!"
1556 #endif
1557 }
1558
1559 /* Can't filter mmap() on this arch, then skip it */
1560 if (filter_syscall == 0)
1561 continue;
1562
1563 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1564 if (r < 0)
1565 return r;
1566
1567 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1568 1,
1569 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1570 if (r < 0)
1571 continue;
1572
1573 if (block_syscall != 0) {
1574 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1575 if (r < 0)
1576 continue;
1577 }
1578
1579 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1580 1,
1581 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1582 if (r < 0)
1583 continue;
1584
1585 #ifdef __NR_pkey_mprotect
1586 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1587 1,
1588 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1589 if (r < 0)
1590 continue;
1591 #endif
1592
1593 if (shmat_syscall > 0) {
1594 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1595 1,
1596 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1597 if (r < 0)
1598 continue;
1599 }
1600
1601 r = seccomp_load(seccomp);
1602 if (ERRNO_IS_SECCOMP_FATAL(r))
1603 return r;
1604 if (r < 0)
1605 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1606 }
1607
1608 return 0;
1609 }
1610
1611 int seccomp_restrict_archs(Set *archs) {
1612 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1613 Iterator i;
1614 void *id;
1615 int r;
1616
1617 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1618 * list.
1619 *
1620 * There are some qualifications. However the most important use is to stop processes from bypassing
1621 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1622 * in a non-native architecture. There are no holes in this use case, at least so far. */
1623
1624 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1625 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1626 * to run a program with the restrictions applied. */
1627 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1628 if (!seccomp)
1629 return -ENOMEM;
1630
1631 SET_FOREACH(id, archs, i) {
1632 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1633 if (r < 0 && r != -EEXIST)
1634 return r;
1635 }
1636
1637 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1638 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1639 * The important thing is that you can block the old 32-bit x86 syscalls.
1640 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1641
1642 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1643 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1644
1645 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1646 if (r < 0 && r != -EEXIST)
1647 return r;
1648 }
1649
1650 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1651 if (r < 0)
1652 return r;
1653
1654 r = seccomp_load(seccomp);
1655 if (ERRNO_IS_SECCOMP_FATAL(r))
1656 return r;
1657 if (r < 0)
1658 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1659
1660 return 0;
1661 }
1662
1663 int parse_syscall_archs(char **l, Set **archs) {
1664 _cleanup_set_free_ Set *_archs;
1665 char **s;
1666 int r;
1667
1668 assert(l);
1669 assert(archs);
1670
1671 r = set_ensure_allocated(&_archs, NULL);
1672 if (r < 0)
1673 return r;
1674
1675 STRV_FOREACH(s, l) {
1676 uint32_t a;
1677
1678 r = seccomp_arch_from_string(*s, &a);
1679 if (r < 0)
1680 return -EINVAL;
1681
1682 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1683 if (r < 0)
1684 return -ENOMEM;
1685 }
1686
1687 *archs = TAKE_PTR(_archs);
1688
1689 return 0;
1690 }
1691
1692 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1693 const char *i;
1694 int r;
1695
1696 assert(set);
1697
1698 NULSTR_FOREACH(i, set->value) {
1699
1700 if (i[0] == '@') {
1701 const SyscallFilterSet *more;
1702
1703 more = syscall_filter_set_find(i);
1704 if (!more)
1705 return -ENXIO;
1706
1707 r = seccomp_filter_set_add(filter, add, more);
1708 if (r < 0)
1709 return r;
1710 } else {
1711 int id;
1712
1713 id = seccomp_syscall_resolve_name(i);
1714 if (id == __NR_SCMP_ERROR) {
1715 log_debug("Couldn't resolve system call, ignoring: %s", i);
1716 continue;
1717 }
1718
1719 if (add) {
1720 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1721 if (r < 0)
1722 return r;
1723 } else
1724 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1725 }
1726 }
1727
1728 return 0;
1729 }
1730
1731 int seccomp_lock_personality(unsigned long personality) {
1732 uint32_t arch;
1733 int r;
1734
1735 if (personality >= PERSONALITY_INVALID)
1736 return -EINVAL;
1737
1738 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1739 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1740
1741 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1742 if (r < 0)
1743 return r;
1744
1745 r = seccomp_rule_add_exact(
1746 seccomp,
1747 SCMP_ACT_ERRNO(EPERM),
1748 SCMP_SYS(personality),
1749 1,
1750 SCMP_A0(SCMP_CMP_NE, personality));
1751 if (r < 0) {
1752 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1753 continue;
1754 }
1755
1756 r = seccomp_load(seccomp);
1757 if (ERRNO_IS_SECCOMP_FATAL(r))
1758 return r;
1759 if (r < 0)
1760 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1761 }
1762
1763 return 0;
1764 }
1765
1766 int seccomp_protect_hostname(void) {
1767 uint32_t arch;
1768 int r;
1769
1770 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1771 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1772
1773 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1774 if (r < 0)
1775 return r;
1776
1777 r = seccomp_rule_add_exact(
1778 seccomp,
1779 SCMP_ACT_ERRNO(EPERM),
1780 SCMP_SYS(sethostname),
1781 0);
1782 if (r < 0) {
1783 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1784 continue;
1785 }
1786
1787 r = seccomp_rule_add_exact(
1788 seccomp,
1789 SCMP_ACT_ERRNO(EPERM),
1790 SCMP_SYS(setdomainname),
1791 0);
1792 if (r < 0) {
1793 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1794 continue;
1795 }
1796
1797 r = seccomp_load(seccomp);
1798 if (ERRNO_IS_SECCOMP_FATAL(r))
1799 return r;
1800 if (r < 0)
1801 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1802 }
1803
1804 return 0;
1805 }
1806
1807 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1808 /* Checks the mode_t parameter of the following system calls:
1809 *
1810 * → chmod() + fchmod() + fchmodat()
1811 * → open() + creat() + openat()
1812 * → mkdir() + mkdirat()
1813 * → mknod() + mknodat()
1814 *
1815 * Returns error if *everything* failed, and 0 otherwise.
1816 */
1817 int r = 0;
1818 bool any = false;
1819
1820 r = seccomp_rule_add_exact(
1821 seccomp,
1822 SCMP_ACT_ERRNO(EPERM),
1823 SCMP_SYS(chmod),
1824 1,
1825 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1826 if (r < 0)
1827 log_debug_errno(r, "Failed to add filter for chmod: %m");
1828 else
1829 any = true;
1830
1831 r = seccomp_rule_add_exact(
1832 seccomp,
1833 SCMP_ACT_ERRNO(EPERM),
1834 SCMP_SYS(fchmod),
1835 1,
1836 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1837 if (r < 0)
1838 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1839 else
1840 any = true;
1841
1842 r = seccomp_rule_add_exact(
1843 seccomp,
1844 SCMP_ACT_ERRNO(EPERM),
1845 SCMP_SYS(fchmodat),
1846 1,
1847 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1848 if (r < 0)
1849 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1850 else
1851 any = true;
1852
1853 r = seccomp_rule_add_exact(
1854 seccomp,
1855 SCMP_ACT_ERRNO(EPERM),
1856 SCMP_SYS(mkdir),
1857 1,
1858 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1859 if (r < 0)
1860 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1861 else
1862 any = true;
1863
1864 r = seccomp_rule_add_exact(
1865 seccomp,
1866 SCMP_ACT_ERRNO(EPERM),
1867 SCMP_SYS(mkdirat),
1868 1,
1869 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1870 if (r < 0)
1871 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1872 else
1873 any = true;
1874
1875 r = seccomp_rule_add_exact(
1876 seccomp,
1877 SCMP_ACT_ERRNO(EPERM),
1878 SCMP_SYS(mknod),
1879 1,
1880 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1881 if (r < 0)
1882 log_debug_errno(r, "Failed to add filter for mknod: %m");
1883 else
1884 any = true;
1885
1886 r = seccomp_rule_add_exact(
1887 seccomp,
1888 SCMP_ACT_ERRNO(EPERM),
1889 SCMP_SYS(mknodat),
1890 1,
1891 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1892 if (r < 0)
1893 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1894 else
1895 any = true;
1896
1897 #if SCMP_SYS(open) > 0
1898 r = seccomp_rule_add_exact(
1899 seccomp,
1900 SCMP_ACT_ERRNO(EPERM),
1901 SCMP_SYS(open),
1902 2,
1903 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1904 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1905 if (r < 0)
1906 log_debug_errno(r, "Failed to add filter for open: %m");
1907 else
1908 any = true;
1909 #endif
1910
1911 r = seccomp_rule_add_exact(
1912 seccomp,
1913 SCMP_ACT_ERRNO(EPERM),
1914 SCMP_SYS(openat),
1915 2,
1916 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1917 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1918 if (r < 0)
1919 log_debug_errno(r, "Failed to add filter for openat: %m");
1920 else
1921 any = true;
1922
1923 r = seccomp_rule_add_exact(
1924 seccomp,
1925 SCMP_ACT_ERRNO(EPERM),
1926 SCMP_SYS(creat),
1927 1,
1928 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1929 if (r < 0)
1930 log_debug_errno(r, "Failed to add filter for creat: %m");
1931 else
1932 any = true;
1933
1934 return any ? 0 : r;
1935 }
1936
1937 int seccomp_restrict_suid_sgid(void) {
1938 uint32_t arch;
1939 int r, k;
1940
1941 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1942 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1943
1944 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1945 if (r < 0)
1946 return r;
1947
1948 r = seccomp_restrict_sxid(seccomp, S_ISUID);
1949 if (r < 0)
1950 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1951
1952 k = seccomp_restrict_sxid(seccomp, S_ISGID);
1953 if (k < 0)
1954 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1955
1956 if (r < 0 && k < 0)
1957 continue;
1958
1959 r = seccomp_load(seccomp);
1960 if (ERRNO_IS_SECCOMP_FATAL(r))
1961 return r;
1962 if (r < 0)
1963 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1964 }
1965
1966 return 0;
1967 }
1968
1969 uint32_t scmp_act_kill_process(void) {
1970
1971 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
1972 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
1973 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
1974 * for single-threaded apps does the right thing. */
1975
1976 #ifdef SCMP_ACT_KILL_PROCESS
1977 if (seccomp_api_get() >= 3)
1978 return SCMP_ACT_KILL_PROCESS;
1979 #endif
1980
1981 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
1982 }