]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
systemctl: restore "systemctl reboot ARG" functionality
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <linux/seccomp.h>
5 #include <seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10
11 #include "af-list.h"
12 #include "alloc-util.h"
13 #include "errno-list.h"
14 #include "macro.h"
15 #include "nsflags.h"
16 #include "nulstr-util.h"
17 #include "process-util.h"
18 #include "seccomp-util.h"
19 #include "set.h"
20 #include "string-util.h"
21 #include "strv.h"
22
23 const uint32_t seccomp_local_archs[] = {
24
25 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
26
27 #if defined(__x86_64__) && defined(__ILP32__)
28 SCMP_ARCH_X86,
29 SCMP_ARCH_X86_64,
30 SCMP_ARCH_X32, /* native */
31 #elif defined(__x86_64__) && !defined(__ILP32__)
32 SCMP_ARCH_X86,
33 SCMP_ARCH_X32,
34 SCMP_ARCH_X86_64, /* native */
35 #elif defined(__i386__)
36 SCMP_ARCH_X86,
37 #elif defined(__aarch64__)
38 SCMP_ARCH_ARM,
39 SCMP_ARCH_AARCH64, /* native */
40 #elif defined(__arm__)
41 SCMP_ARCH_ARM,
42 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
43 SCMP_ARCH_MIPSEL,
44 SCMP_ARCH_MIPS, /* native */
45 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPS,
47 SCMP_ARCH_MIPSEL, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
49 SCMP_ARCH_MIPSEL,
50 SCMP_ARCH_MIPS,
51 SCMP_ARCH_MIPSEL64N32,
52 SCMP_ARCH_MIPS64N32,
53 SCMP_ARCH_MIPSEL64,
54 SCMP_ARCH_MIPS64, /* native */
55 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPS,
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS64N32,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64,
61 SCMP_ARCH_MIPSEL64, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
63 SCMP_ARCH_MIPSEL,
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL64,
66 SCMP_ARCH_MIPS64,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64N32, /* native */
69 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPS,
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS64,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64N32,
75 SCMP_ARCH_MIPSEL64N32, /* native */
76 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
77 SCMP_ARCH_PPC,
78 SCMP_ARCH_PPC64LE,
79 SCMP_ARCH_PPC64, /* native */
80 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
81 SCMP_ARCH_PPC,
82 SCMP_ARCH_PPC64,
83 SCMP_ARCH_PPC64LE, /* native */
84 #elif defined(__powerpc__)
85 SCMP_ARCH_PPC,
86 #elif defined(__s390x__)
87 SCMP_ARCH_S390,
88 SCMP_ARCH_S390X, /* native */
89 #elif defined(__s390__)
90 SCMP_ARCH_S390,
91 #endif
92 (uint32_t) -1
93 };
94
95 const char* seccomp_arch_to_string(uint32_t c) {
96 /* Maintain order used in <seccomp.h>.
97 *
98 * Names used here should be the same as those used for ConditionArchitecture=,
99 * except for "subarchitectures" like x32. */
100
101 switch(c) {
102 case SCMP_ARCH_NATIVE:
103 return "native";
104 case SCMP_ARCH_X86:
105 return "x86";
106 case SCMP_ARCH_X86_64:
107 return "x86-64";
108 case SCMP_ARCH_X32:
109 return "x32";
110 case SCMP_ARCH_ARM:
111 return "arm";
112 case SCMP_ARCH_AARCH64:
113 return "arm64";
114 case SCMP_ARCH_MIPS:
115 return "mips";
116 case SCMP_ARCH_MIPS64:
117 return "mips64";
118 case SCMP_ARCH_MIPS64N32:
119 return "mips64-n32";
120 case SCMP_ARCH_MIPSEL:
121 return "mips-le";
122 case SCMP_ARCH_MIPSEL64:
123 return "mips64-le";
124 case SCMP_ARCH_MIPSEL64N32:
125 return "mips64-le-n32";
126 case SCMP_ARCH_PPC:
127 return "ppc";
128 case SCMP_ARCH_PPC64:
129 return "ppc64";
130 case SCMP_ARCH_PPC64LE:
131 return "ppc64-le";
132 case SCMP_ARCH_S390:
133 return "s390";
134 case SCMP_ARCH_S390X:
135 return "s390x";
136 default:
137 return NULL;
138 }
139 }
140
141 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
142 if (!n)
143 return -EINVAL;
144
145 assert(ret);
146
147 if (streq(n, "native"))
148 *ret = SCMP_ARCH_NATIVE;
149 else if (streq(n, "x86"))
150 *ret = SCMP_ARCH_X86;
151 else if (streq(n, "x86-64"))
152 *ret = SCMP_ARCH_X86_64;
153 else if (streq(n, "x32"))
154 *ret = SCMP_ARCH_X32;
155 else if (streq(n, "arm"))
156 *ret = SCMP_ARCH_ARM;
157 else if (streq(n, "arm64"))
158 *ret = SCMP_ARCH_AARCH64;
159 else if (streq(n, "mips"))
160 *ret = SCMP_ARCH_MIPS;
161 else if (streq(n, "mips64"))
162 *ret = SCMP_ARCH_MIPS64;
163 else if (streq(n, "mips64-n32"))
164 *ret = SCMP_ARCH_MIPS64N32;
165 else if (streq(n, "mips-le"))
166 *ret = SCMP_ARCH_MIPSEL;
167 else if (streq(n, "mips64-le"))
168 *ret = SCMP_ARCH_MIPSEL64;
169 else if (streq(n, "mips64-le-n32"))
170 *ret = SCMP_ARCH_MIPSEL64N32;
171 else if (streq(n, "ppc"))
172 *ret = SCMP_ARCH_PPC;
173 else if (streq(n, "ppc64"))
174 *ret = SCMP_ARCH_PPC64;
175 else if (streq(n, "ppc64-le"))
176 *ret = SCMP_ARCH_PPC64LE;
177 else if (streq(n, "s390"))
178 *ret = SCMP_ARCH_S390;
179 else if (streq(n, "s390x"))
180 *ret = SCMP_ARCH_S390X;
181 else
182 return -EINVAL;
183
184 return 0;
185 }
186
187 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
188 scmp_filter_ctx seccomp;
189 int r;
190
191 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
192 * any others. Also, turns off the NNP fiddling. */
193
194 seccomp = seccomp_init(default_action);
195 if (!seccomp)
196 return -ENOMEM;
197
198 if (arch != SCMP_ARCH_NATIVE &&
199 arch != seccomp_arch_native()) {
200
201 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
202 if (r < 0)
203 goto finish;
204
205 r = seccomp_arch_add(seccomp, arch);
206 if (r < 0)
207 goto finish;
208
209 assert(seccomp_arch_exist(seccomp, arch) >= 0);
210 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
211 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
212 } else {
213 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
214 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
215 }
216
217 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
218 if (r < 0)
219 goto finish;
220
221 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
222 if (r < 0)
223 goto finish;
224
225 *ret = seccomp;
226 return 0;
227
228 finish:
229 seccomp_release(seccomp);
230 return r;
231 }
232
/* Returns true if querying the current seccomp mode via prctl(PR_GET_SECCOMP) does not fail. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
236
/* Probes for seccomp filter mode by asking the kernel to install a NULL filter program. The probe counts
 * as successful only if the call fails with EFAULT; any other outcome is reported as "not available". */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
241
/* Returns true if both the basic seccomp probe and the filter mode probe succeed. The probes run on the
 * first call only; the result is kept in a function-local static and returned on every later call. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, otherwise the boolean result */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
252
253 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
254 [SYSCALL_FILTER_SET_DEFAULT] = {
255 .name = "@default",
256 .help = "System calls that are always permitted",
257 .value =
258 "clock_getres\0"
259 "clock_gettime\0"
260 "clock_nanosleep\0"
261 "execve\0"
262 "exit\0"
263 "exit_group\0"
264 "futex\0"
265 "get_robust_list\0"
266 "get_thread_area\0"
267 "getegid\0"
268 "getegid32\0"
269 "geteuid\0"
270 "geteuid32\0"
271 "getgid\0"
272 "getgid32\0"
273 "getgroups\0"
274 "getgroups32\0"
275 "getpgid\0"
276 "getpgrp\0"
277 "getpid\0"
278 "getppid\0"
279 "getresgid\0"
280 "getresgid32\0"
281 "getresuid\0"
282 "getresuid32\0"
283 "getrlimit\0" /* make sure processes can query stack size and such */
284 "getsid\0"
285 "gettid\0"
286 "gettimeofday\0"
287 "getuid\0"
288 "getuid32\0"
289 "membarrier\0"
290 "nanosleep\0"
291 "pause\0"
292 "prlimit64\0"
293 "restart_syscall\0"
294 "rt_sigreturn\0"
295 "sched_yield\0"
296 "set_robust_list\0"
297 "set_thread_area\0"
298 "set_tid_address\0"
299 "set_tls\0"
300 "sigreturn\0"
301 "time\0"
302 "ugetrlimit\0"
303 },
304 [SYSCALL_FILTER_SET_AIO] = {
305 .name = "@aio",
306 .help = "Asynchronous IO",
307 .value =
308 "io_cancel\0"
309 "io_destroy\0"
310 "io_getevents\0"
311 "io_pgetevents\0"
312 "io_setup\0"
313 "io_submit\0"
314 },
315 [SYSCALL_FILTER_SET_BASIC_IO] = {
316 .name = "@basic-io",
317 .help = "Basic IO",
318 .value =
319 "_llseek\0"
320 "close\0"
321 "dup\0"
322 "dup2\0"
323 "dup3\0"
324 "lseek\0"
325 "pread64\0"
326 "preadv\0"
327 "preadv2\0"
328 "pwrite64\0"
329 "pwritev\0"
330 "pwritev2\0"
331 "read\0"
332 "readv\0"
333 "write\0"
334 "writev\0"
335 },
336 [SYSCALL_FILTER_SET_CHOWN] = {
337 .name = "@chown",
338 .help = "Change ownership of files and directories",
339 .value =
340 "chown\0"
341 "chown32\0"
342 "fchown\0"
343 "fchown32\0"
344 "fchownat\0"
345 "lchown\0"
346 "lchown32\0"
347 },
348 [SYSCALL_FILTER_SET_CLOCK] = {
349 .name = "@clock",
350 .help = "Change the system time",
351 .value =
352 "adjtimex\0"
353 "clock_adjtime\0"
354 "clock_settime\0"
355 "settimeofday\0"
356 "stime\0"
357 },
358 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
359 .name = "@cpu-emulation",
360 .help = "System calls for CPU emulation functionality",
361 .value =
362 "modify_ldt\0"
363 "subpage_prot\0"
364 "switch_endian\0"
365 "vm86\0"
366 "vm86old\0"
367 },
368 [SYSCALL_FILTER_SET_DEBUG] = {
369 .name = "@debug",
370 .help = "Debugging, performance monitoring and tracing functionality",
371 .value =
372 "lookup_dcookie\0"
373 "perf_event_open\0"
374 "ptrace\0"
375 "rtas\0"
376 #ifdef __NR_s390_runtime_instr
377 "s390_runtime_instr\0"
378 #endif
379 "sys_debug_setcontext\0"
380 },
381 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
382 .name = "@file-system",
383 .help = "File system operations",
384 .value =
385 "access\0"
386 "chdir\0"
387 "chmod\0"
388 "close\0"
389 "creat\0"
390 "faccessat\0"
391 "fallocate\0"
392 "fchdir\0"
393 "fchmod\0"
394 "fchmodat\0"
395 "fcntl\0"
396 "fcntl64\0"
397 "fgetxattr\0"
398 "flistxattr\0"
399 "fremovexattr\0"
400 "fsetxattr\0"
401 "fstat\0"
402 "fstat64\0"
403 "fstatat64\0"
404 "fstatfs\0"
405 "fstatfs64\0"
406 "ftruncate\0"
407 "ftruncate64\0"
408 "futimesat\0"
409 "getcwd\0"
410 "getdents\0"
411 "getdents64\0"
412 "getxattr\0"
413 "inotify_add_watch\0"
414 "inotify_init\0"
415 "inotify_init1\0"
416 "inotify_rm_watch\0"
417 "lgetxattr\0"
418 "link\0"
419 "linkat\0"
420 "listxattr\0"
421 "llistxattr\0"
422 "lremovexattr\0"
423 "lsetxattr\0"
424 "lstat\0"
425 "lstat64\0"
426 "mkdir\0"
427 "mkdirat\0"
428 "mknod\0"
429 "mknodat\0"
430 "mmap\0"
431 "mmap2\0"
432 "munmap\0"
433 "newfstatat\0"
434 "oldfstat\0"
435 "oldlstat\0"
436 "oldstat\0"
437 "open\0"
438 "openat\0"
439 "readlink\0"
440 "readlinkat\0"
441 "removexattr\0"
442 "rename\0"
443 "renameat\0"
444 "renameat2\0"
445 "rmdir\0"
446 "setxattr\0"
447 "stat\0"
448 "stat64\0"
449 "statfs\0"
450 "statfs64\0"
451 #ifdef __NR_statx
452 "statx\0"
453 #endif
454 "symlink\0"
455 "symlinkat\0"
456 "truncate\0"
457 "truncate64\0"
458 "unlink\0"
459 "unlinkat\0"
460 "utime\0"
461 "utimensat\0"
462 "utimes\0"
463 },
464 [SYSCALL_FILTER_SET_IO_EVENT] = {
465 .name = "@io-event",
466 .help = "Event loop system calls",
467 .value =
468 "_newselect\0"
469 "epoll_create\0"
470 "epoll_create1\0"
471 "epoll_ctl\0"
472 "epoll_ctl_old\0"
473 "epoll_pwait\0"
474 "epoll_wait\0"
475 "epoll_wait_old\0"
476 "eventfd\0"
477 "eventfd2\0"
478 "poll\0"
479 "ppoll\0"
480 "pselect6\0"
481 "select\0"
482 },
483 [SYSCALL_FILTER_SET_IPC] = {
484 .name = "@ipc",
485 .help = "SysV IPC, POSIX Message Queues or other IPC",
486 .value =
487 "ipc\0"
488 "memfd_create\0"
489 "mq_getsetattr\0"
490 "mq_notify\0"
491 "mq_open\0"
492 "mq_timedreceive\0"
493 "mq_timedsend\0"
494 "mq_unlink\0"
495 "msgctl\0"
496 "msgget\0"
497 "msgrcv\0"
498 "msgsnd\0"
499 "pipe\0"
500 "pipe2\0"
501 "process_vm_readv\0"
502 "process_vm_writev\0"
503 "semctl\0"
504 "semget\0"
505 "semop\0"
506 "semtimedop\0"
507 "shmat\0"
508 "shmctl\0"
509 "shmdt\0"
510 "shmget\0"
511 },
512 [SYSCALL_FILTER_SET_KEYRING] = {
513 .name = "@keyring",
514 .help = "Kernel keyring access",
515 .value =
516 "add_key\0"
517 "keyctl\0"
518 "request_key\0"
519 },
520 [SYSCALL_FILTER_SET_MEMLOCK] = {
521 .name = "@memlock",
522 .help = "Memory locking control",
523 .value =
524 "mlock\0"
525 "mlock2\0"
526 "mlockall\0"
527 "munlock\0"
528 "munlockall\0"
529 },
530 [SYSCALL_FILTER_SET_MODULE] = {
531 .name = "@module",
532 .help = "Loading and unloading of kernel modules",
533 .value =
534 "delete_module\0"
535 "finit_module\0"
536 "init_module\0"
537 },
538 [SYSCALL_FILTER_SET_MOUNT] = {
539 .name = "@mount",
540 .help = "Mounting and unmounting of file systems",
541 .value =
542 "chroot\0"
543 "mount\0"
544 "pivot_root\0"
545 "umount\0"
546 "umount2\0"
547 },
548 [SYSCALL_FILTER_SET_NETWORK_IO] = {
549 .name = "@network-io",
550 .help = "Network or Unix socket IO, should not be needed if not network facing",
551 .value =
552 "accept\0"
553 "accept4\0"
554 "bind\0"
555 "connect\0"
556 "getpeername\0"
557 "getsockname\0"
558 "getsockopt\0"
559 "listen\0"
560 "recv\0"
561 "recvfrom\0"
562 "recvmmsg\0"
563 "recvmsg\0"
564 "send\0"
565 "sendmmsg\0"
566 "sendmsg\0"
567 "sendto\0"
568 "setsockopt\0"
569 "shutdown\0"
570 "socket\0"
571 "socketcall\0"
572 "socketpair\0"
573 },
574 [SYSCALL_FILTER_SET_OBSOLETE] = {
575 /* some unknown even to libseccomp */
576 .name = "@obsolete",
577 .help = "Unusual, obsolete or unimplemented system calls",
578 .value =
579 "_sysctl\0"
580 "afs_syscall\0"
581 "bdflush\0"
582 "break\0"
583 "create_module\0"
584 "ftime\0"
585 "get_kernel_syms\0"
586 "getpmsg\0"
587 "gtty\0"
588 "idle\0"
589 "lock\0"
590 "mpx\0"
591 "prof\0"
592 "profil\0"
593 "putpmsg\0"
594 "query_module\0"
595 "security\0"
596 "sgetmask\0"
597 "ssetmask\0"
598 "stty\0"
599 "sysfs\0"
600 "tuxcall\0"
601 "ulimit\0"
602 "uselib\0"
603 "ustat\0"
604 "vserver\0"
605 },
606 [SYSCALL_FILTER_SET_PRIVILEGED] = {
607 .name = "@privileged",
608 .help = "All system calls which need super-user capabilities",
609 .value =
610 "@chown\0"
611 "@clock\0"
612 "@module\0"
613 "@raw-io\0"
614 "@reboot\0"
615 "@swap\0"
616 "_sysctl\0"
617 "acct\0"
618 "bpf\0"
619 "capset\0"
620 "chroot\0"
621 "fanotify_init\0"
622 "nfsservctl\0"
623 "open_by_handle_at\0"
624 "pivot_root\0"
625 "quotactl\0"
626 "setdomainname\0"
627 "setfsuid\0"
628 "setfsuid32\0"
629 "setgroups\0"
630 "setgroups32\0"
631 "sethostname\0"
632 "setresuid\0"
633 "setresuid32\0"
634 "setreuid\0"
635 "setreuid32\0"
636 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
637 "setuid32\0"
638 "vhangup\0"
639 },
640 [SYSCALL_FILTER_SET_PROCESS] = {
641 .name = "@process",
642 .help = "Process control, execution, namespaceing operations",
643 .value =
644 "arch_prctl\0"
645 "capget\0" /* Able to query arbitrary processes */
646 "clone\0"
647 "execveat\0"
648 "fork\0"
649 "getrusage\0"
650 "kill\0"
651 "prctl\0"
652 "rt_sigqueueinfo\0"
653 "rt_tgsigqueueinfo\0"
654 "setns\0"
655 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
656 "tgkill\0"
657 "times\0"
658 "tkill\0"
659 "unshare\0"
660 "vfork\0"
661 "wait4\0"
662 "waitid\0"
663 "waitpid\0"
664 },
665 [SYSCALL_FILTER_SET_RAW_IO] = {
666 .name = "@raw-io",
667 .help = "Raw I/O port access",
668 .value =
669 "ioperm\0"
670 "iopl\0"
671 "pciconfig_iobase\0"
672 "pciconfig_read\0"
673 "pciconfig_write\0"
674 #ifdef __NR_s390_pci_mmio_read
675 "s390_pci_mmio_read\0"
676 #endif
677 #ifdef __NR_s390_pci_mmio_write
678 "s390_pci_mmio_write\0"
679 #endif
680 },
681 [SYSCALL_FILTER_SET_REBOOT] = {
682 .name = "@reboot",
683 .help = "Reboot and reboot preparation/kexec",
684 .value =
685 "kexec_file_load\0"
686 "kexec_load\0"
687 "reboot\0"
688 },
689 [SYSCALL_FILTER_SET_RESOURCES] = {
690 .name = "@resources",
691 .help = "Alter resource settings",
692 .value =
693 "ioprio_set\0"
694 "mbind\0"
695 "migrate_pages\0"
696 "move_pages\0"
697 "nice\0"
698 "sched_setaffinity\0"
699 "sched_setattr\0"
700 "sched_setparam\0"
701 "sched_setscheduler\0"
702 "set_mempolicy\0"
703 "setpriority\0"
704 "setrlimit\0"
705 },
706 [SYSCALL_FILTER_SET_SETUID] = {
707 .name = "@setuid",
708 .help = "Operations for changing user/group credentials",
709 .value =
710 "setgid\0"
711 "setgid32\0"
712 "setgroups\0"
713 "setgroups32\0"
714 "setregid\0"
715 "setregid32\0"
716 "setresgid\0"
717 "setresgid32\0"
718 "setresuid\0"
719 "setresuid32\0"
720 "setreuid\0"
721 "setreuid32\0"
722 "setuid\0"
723 "setuid32\0"
724 },
725 [SYSCALL_FILTER_SET_SIGNAL] = {
726 .name = "@signal",
727 .help = "Process signal handling",
728 .value =
729 "rt_sigaction\0"
730 "rt_sigpending\0"
731 "rt_sigprocmask\0"
732 "rt_sigsuspend\0"
733 "rt_sigtimedwait\0"
734 "sigaction\0"
735 "sigaltstack\0"
736 "signal\0"
737 "signalfd\0"
738 "signalfd4\0"
739 "sigpending\0"
740 "sigprocmask\0"
741 "sigsuspend\0"
742 },
743 [SYSCALL_FILTER_SET_SWAP] = {
744 .name = "@swap",
745 .help = "Enable/disable swap devices",
746 .value =
747 "swapoff\0"
748 "swapon\0"
749 },
750 [SYSCALL_FILTER_SET_SYNC] = {
751 .name = "@sync",
752 .help = "Synchronize files and memory to storage",
753 .value =
754 "fdatasync\0"
755 "fsync\0"
756 "msync\0"
757 "sync\0"
758 "sync_file_range\0"
759 "syncfs\0"
760 },
761 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
762 .name = "@system-service",
763 .help = "General system service operations",
764 .value =
765 "@aio\0"
766 "@basic-io\0"
767 "@chown\0"
768 "@default\0"
769 "@file-system\0"
770 "@io-event\0"
771 "@ipc\0"
772 "@keyring\0"
773 "@memlock\0"
774 "@network-io\0"
775 "@process\0"
776 "@resources\0"
777 "@setuid\0"
778 "@signal\0"
779 "@sync\0"
780 "@timer\0"
781 "brk\0"
782 "capget\0"
783 "capset\0"
784 "copy_file_range\0"
785 "fadvise64\0"
786 "fadvise64_64\0"
787 "flock\0"
788 "get_mempolicy\0"
789 "getcpu\0"
790 "getpriority\0"
791 "getrandom\0"
792 "ioctl\0"
793 "ioprio_get\0"
794 "kcmp\0"
795 "madvise\0"
796 "mprotect\0"
797 "mremap\0"
798 "name_to_handle_at\0"
799 "oldolduname\0"
800 "olduname\0"
801 "personality\0"
802 "readahead\0"
803 "readdir\0"
804 "remap_file_pages\0"
805 "sched_get_priority_max\0"
806 "sched_get_priority_min\0"
807 "sched_getaffinity\0"
808 "sched_getattr\0"
809 "sched_getparam\0"
810 "sched_getscheduler\0"
811 "sched_rr_get_interval\0"
812 "sched_yield\0"
813 "sendfile\0"
814 "sendfile64\0"
815 "setfsgid\0"
816 "setfsgid32\0"
817 "setfsuid\0"
818 "setfsuid32\0"
819 "setpgid\0"
820 "setsid\0"
821 "splice\0"
822 "sysinfo\0"
823 "tee\0"
824 "umask\0"
825 "uname\0"
826 "userfaultfd\0"
827 "vmsplice\0"
828 },
829 [SYSCALL_FILTER_SET_TIMER] = {
830 .name = "@timer",
831 .help = "Schedule operations by time",
832 .value =
833 "alarm\0"
834 "getitimer\0"
835 "setitimer\0"
836 "timer_create\0"
837 "timer_delete\0"
838 "timer_getoverrun\0"
839 "timer_gettime\0"
840 "timer_settime\0"
841 "timerfd_create\0"
842 "timerfd_gettime\0"
843 "timerfd_settime\0"
844 "times\0"
845 },
846 };
847
848 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
849 unsigned i;
850
851 if (isempty(name) || name[0] != '@')
852 return NULL;
853
854 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
855 if (streq(syscall_filter_sets[i].name, name))
856 return syscall_filter_sets + i;
857
858 return NULL;
859 }
860
861 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
862
863 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
864 assert(seccomp);
865 assert(name);
866
867 if (strv_contains(exclude, name))
868 return 0;
869
870 if (name[0] == '@') {
871 const SyscallFilterSet *other;
872
873 other = syscall_filter_set_find(name);
874 if (!other)
875 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
876 "Filter set %s is not known!",
877 name);
878
879 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
880
881 } else {
882 int id, r;
883
884 id = seccomp_syscall_resolve_name(name);
885 if (id == __NR_SCMP_ERROR) {
886 if (log_missing)
887 log_debug("System call %s is not known, ignoring.", name);
888 return 0;
889 }
890
891 r = seccomp_rule_add_exact(seccomp, action, id, 0);
892 if (r < 0) {
893 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
894 bool ignore = r == -EDOM;
895
896 if (!ignore || log_missing)
897 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
898 name, id, ignore ? ", ignoring" : "");
899 if (!ignore)
900 return r;
901 }
902
903 return 0;
904 }
905 }
906
907 static int seccomp_add_syscall_filter_set(
908 scmp_filter_ctx seccomp,
909 const SyscallFilterSet *set,
910 uint32_t action,
911 char **exclude,
912 bool log_missing) {
913
914 const char *sys;
915 int r;
916
917 assert(seccomp);
918 assert(set);
919
920 NULSTR_FOREACH(sys, set->value) {
921 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
922 if (r < 0)
923 return r;
924 }
925
926 return 0;
927 }
928
929 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
930 uint32_t arch;
931 int r;
932
933 assert(set);
934
935 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
936 * each local arch. */
937
938 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
939 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
940
941 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
942
943 r = seccomp_init_for_arch(&seccomp, arch, default_action);
944 if (r < 0)
945 return r;
946
947 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
948 if (r < 0)
949 return log_debug_errno(r, "Failed to add filter set: %m");
950
951 r = seccomp_load(seccomp);
952 if (IN_SET(r, -EPERM, -EACCES))
953 return r;
954 if (r < 0)
955 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
956 }
957
958 return 0;
959 }
960
961 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
962 uint32_t arch;
963 int r;
964
965 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
966 * SyscallFilterSet* table. */
967
968 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
969 return 0;
970
971 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
972 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
973 Iterator i;
974 void *syscall_id, *val;
975
976 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
977
978 r = seccomp_init_for_arch(&seccomp, arch, default_action);
979 if (r < 0)
980 return r;
981
982 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
983 uint32_t a = action;
984 int id = PTR_TO_INT(syscall_id) - 1;
985 int error = PTR_TO_INT(val);
986
987 if (action != SCMP_ACT_ALLOW && error >= 0)
988 a = SCMP_ACT_ERRNO(error);
989
990 r = seccomp_rule_add_exact(seccomp, a, id, 0);
991 if (r < 0) {
992 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
993 _cleanup_free_ char *n = NULL;
994 bool ignore;
995
996 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
997 ignore = r == -EDOM;
998 if (!ignore || log_missing)
999 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1000 strna(n), id, ignore ? ", ignoring" : "");
1001 if (!ignore)
1002 return r;
1003 }
1004 }
1005
1006 r = seccomp_load(seccomp);
1007 if (IN_SET(r, -EPERM, -EACCES))
1008 return r;
1009 if (r < 0)
1010 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1011 }
1012
1013 return 0;
1014 }
1015
1016 int seccomp_parse_syscall_filter_full(
1017 const char *name,
1018 int errno_num,
1019 Hashmap *filter,
1020 SeccompParseFlags flags,
1021 const char *unit,
1022 const char *filename,
1023 unsigned line) {
1024
1025 int r;
1026
1027 assert(name);
1028 assert(filter);
1029
1030 if (name[0] == '@') {
1031 const SyscallFilterSet *set;
1032 const char *i;
1033
1034 set = syscall_filter_set_find(name);
1035 if (!set) {
1036 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1037 return -EINVAL;
1038
1039 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1040 "Unknown system call group, ignoring: %s", name);
1041 return 0;
1042 }
1043
1044 NULSTR_FOREACH(i, set->value) {
1045 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1046 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1047 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1048 * about them. */
1049 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1050 if (r < 0)
1051 return r;
1052 }
1053 } else {
1054 int id;
1055
1056 id = seccomp_syscall_resolve_name(name);
1057 if (id == __NR_SCMP_ERROR) {
1058 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1059 return -EINVAL;
1060
1061 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1062 "Failed to parse system call, ignoring: %s", name);
1063 return 0;
1064 }
1065
1066 /* If we previously wanted to forbid a syscall and now
1067 * we want to allow it, then remove it from the list. */
1068 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1069 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1070 if (r < 0)
1071 switch (r) {
1072 case -ENOMEM:
1073 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1074 case -EEXIST:
1075 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1076 break;
1077 default:
1078 return r;
1079 }
1080 } else
1081 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1082 }
1083
1084 return 0;
1085 }
1086
1087 int seccomp_restrict_namespaces(unsigned long retain) {
1088 uint32_t arch;
1089 int r;
1090
1091 if (DEBUG_LOGGING) {
1092 _cleanup_free_ char *s = NULL;
1093
1094 (void) namespace_flags_to_string(retain, &s);
1095 log_debug("Restricting namespace to: %s.", strna(s));
1096 }
1097
1098 /* NOOP? */
1099 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1100 return 0;
1101
1102 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1103 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1104 unsigned i;
1105
1106 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1107
1108 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1109 if (r < 0)
1110 return r;
1111
1112 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1113 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1114 * altogether. */
1115 r = seccomp_rule_add_exact(
1116 seccomp,
1117 SCMP_ACT_ERRNO(EPERM),
1118 SCMP_SYS(setns),
1119 0);
1120 else
1121 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1122 * special invocation with a zero flags argument, right here. */
1123 r = seccomp_rule_add_exact(
1124 seccomp,
1125 SCMP_ACT_ERRNO(EPERM),
1126 SCMP_SYS(setns),
1127 1,
1128 SCMP_A1(SCMP_CMP_EQ, 0));
1129 if (r < 0) {
1130 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1131 continue;
1132 }
1133
1134 for (i = 0; namespace_flag_map[i].name; i++) {
1135 unsigned long f;
1136
1137 f = namespace_flag_map[i].flag;
1138 if ((retain & f) == f) {
1139 log_debug("Permitting %s.", namespace_flag_map[i].name);
1140 continue;
1141 }
1142
1143 log_debug("Blocking %s.", namespace_flag_map[i].name);
1144
1145 r = seccomp_rule_add_exact(
1146 seccomp,
1147 SCMP_ACT_ERRNO(EPERM),
1148 SCMP_SYS(unshare),
1149 1,
1150 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1151 if (r < 0) {
1152 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1153 break;
1154 }
1155
1156 /* On s390/s390x the first two parameters to clone are switched */
1157 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1158 r = seccomp_rule_add_exact(
1159 seccomp,
1160 SCMP_ACT_ERRNO(EPERM),
1161 SCMP_SYS(clone),
1162 1,
1163 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1164 else
1165 r = seccomp_rule_add_exact(
1166 seccomp,
1167 SCMP_ACT_ERRNO(EPERM),
1168 SCMP_SYS(clone),
1169 1,
1170 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1171 if (r < 0) {
1172 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1173 break;
1174 }
1175
1176 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1177 r = seccomp_rule_add_exact(
1178 seccomp,
1179 SCMP_ACT_ERRNO(EPERM),
1180 SCMP_SYS(setns),
1181 1,
1182 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1183 if (r < 0) {
1184 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1185 break;
1186 }
1187 }
1188 }
1189 if (r < 0)
1190 continue;
1191
1192 r = seccomp_load(seccomp);
1193 if (IN_SET(r, -EPERM, -EACCES))
1194 return r;
1195 if (r < 0)
1196 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1197 }
1198
1199 return 0;
1200 }
1201
1202 int seccomp_protect_sysctl(void) {
1203 uint32_t arch;
1204 int r;
1205
1206 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1207 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1208
1209 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1210
1211 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1212 /* No _sysctl syscall */
1213 continue;
1214
1215 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1216 if (r < 0)
1217 return r;
1218
1219 r = seccomp_rule_add_exact(
1220 seccomp,
1221 SCMP_ACT_ERRNO(EPERM),
1222 SCMP_SYS(_sysctl),
1223 0);
1224 if (r < 0) {
1225 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1226 continue;
1227 }
1228
1229 r = seccomp_load(seccomp);
1230 if (IN_SET(r, -EPERM, -EACCES))
1231 return r;
1232 if (r < 0)
1233 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1234 }
1235
1236 return 0;
1237 }
1238
1239 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1240 uint32_t arch;
1241 int r;
1242
1243 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1244 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1245 bool supported;
1246 Iterator i;
1247
1248 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1249
1250 switch (arch) {
1251
1252 case SCMP_ARCH_X86_64:
1253 case SCMP_ARCH_X32:
1254 case SCMP_ARCH_ARM:
1255 case SCMP_ARCH_AARCH64:
1256 case SCMP_ARCH_PPC:
1257 case SCMP_ARCH_PPC64:
1258 case SCMP_ARCH_PPC64LE:
1259 case SCMP_ARCH_MIPSEL64N32:
1260 case SCMP_ARCH_MIPS64N32:
1261 case SCMP_ARCH_MIPSEL64:
1262 case SCMP_ARCH_MIPS64:
1263 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1264 supported = true;
1265 break;
1266
1267 case SCMP_ARCH_S390:
1268 case SCMP_ARCH_S390X:
1269 case SCMP_ARCH_X86:
1270 case SCMP_ARCH_MIPSEL:
1271 case SCMP_ARCH_MIPS:
1272 default:
1273 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1274 * don't know */
1275 supported = false;
1276 break;
1277 }
1278
1279 if (!supported)
1280 continue;
1281
1282 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1283 if (r < 0)
1284 return r;
1285
1286 if (whitelist) {
1287 int af, first = 0, last = 0;
1288 void *afp;
1289
1290 /* If this is a whitelist, we first block the address families that are out of range and then
1291 * everything that is not in the set. First, we find the lowest and highest address family in
1292 * the set. */
1293
1294 SET_FOREACH(afp, address_families, i) {
1295 af = PTR_TO_INT(afp);
1296
1297 if (af <= 0 || af >= af_max())
1298 continue;
1299
1300 if (first == 0 || af < first)
1301 first = af;
1302
1303 if (last == 0 || af > last)
1304 last = af;
1305 }
1306
1307 assert((first == 0) == (last == 0));
1308
1309 if (first == 0) {
1310
1311 /* No entries in the valid range, block everything */
1312 r = seccomp_rule_add_exact(
1313 seccomp,
1314 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1315 SCMP_SYS(socket),
1316 0);
1317 if (r < 0) {
1318 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1319 continue;
1320 }
1321
1322 } else {
1323
1324 /* Block everything below the first entry */
1325 r = seccomp_rule_add_exact(
1326 seccomp,
1327 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1328 SCMP_SYS(socket),
1329 1,
1330 SCMP_A0(SCMP_CMP_LT, first));
1331 if (r < 0) {
1332 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333 continue;
1334 }
1335
1336 /* Block everything above the last entry */
1337 r = seccomp_rule_add_exact(
1338 seccomp,
1339 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1340 SCMP_SYS(socket),
1341 1,
1342 SCMP_A0(SCMP_CMP_GT, last));
1343 if (r < 0) {
1344 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1345 continue;
1346 }
1347
1348 /* Block everything between the first and last entry */
1349 for (af = 1; af < af_max(); af++) {
1350
1351 if (set_contains(address_families, INT_TO_PTR(af)))
1352 continue;
1353
1354 r = seccomp_rule_add_exact(
1355 seccomp,
1356 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1357 SCMP_SYS(socket),
1358 1,
1359 SCMP_A0(SCMP_CMP_EQ, af));
1360 if (r < 0)
1361 break;
1362 }
1363 if (r < 0) {
1364 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1365 continue;
1366 }
1367 }
1368
1369 } else {
1370 void *af;
1371
1372 /* If this is a blacklist, then generate one rule for
1373 * each address family that are then combined in OR
1374 * checks. */
1375
1376 SET_FOREACH(af, address_families, i) {
1377
1378 r = seccomp_rule_add_exact(
1379 seccomp,
1380 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1381 SCMP_SYS(socket),
1382 1,
1383 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1384 if (r < 0)
1385 break;
1386 }
1387 if (r < 0) {
1388 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1389 continue;
1390 }
1391 }
1392
1393 r = seccomp_load(seccomp);
1394 if (IN_SET(r, -EPERM, -EACCES))
1395 return r;
1396 if (r < 0)
1397 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 }
1399
1400 return 0;
1401 }
1402
1403 int seccomp_restrict_realtime(void) {
1404 static const int permitted_policies[] = {
1405 SCHED_OTHER,
1406 SCHED_BATCH,
1407 SCHED_IDLE,
1408 };
1409
1410 int r, max_policy = 0;
1411 uint32_t arch;
1412 unsigned i;
1413
1414 /* Determine the highest policy constant we want to allow */
1415 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1416 if (permitted_policies[i] > max_policy)
1417 max_policy = permitted_policies[i];
1418
1419 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1420 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1421 int p;
1422
1423 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1424
1425 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1426 if (r < 0)
1427 return r;
1428
1429 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1430 * whitelist. */
1431 for (p = 0; p < max_policy; p++) {
1432 bool good = false;
1433
1434 /* Check if this is in the whitelist. */
1435 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1436 if (permitted_policies[i] == p) {
1437 good = true;
1438 break;
1439 }
1440
1441 if (good)
1442 continue;
1443
1444 /* Deny this policy */
1445 r = seccomp_rule_add_exact(
1446 seccomp,
1447 SCMP_ACT_ERRNO(EPERM),
1448 SCMP_SYS(sched_setscheduler),
1449 1,
1450 SCMP_A1(SCMP_CMP_EQ, p));
1451 if (r < 0) {
1452 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1453 continue;
1454 }
1455 }
1456
1457 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1458 * unsigned here, hence no need no check for < 0 values. */
1459 r = seccomp_rule_add_exact(
1460 seccomp,
1461 SCMP_ACT_ERRNO(EPERM),
1462 SCMP_SYS(sched_setscheduler),
1463 1,
1464 SCMP_A1(SCMP_CMP_GT, max_policy));
1465 if (r < 0) {
1466 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1467 continue;
1468 }
1469
1470 r = seccomp_load(seccomp);
1471 if (IN_SET(r, -EPERM, -EACCES))
1472 return r;
1473 if (r < 0)
1474 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1475 }
1476
1477 return 0;
1478 }
1479
1480 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1481 uint32_t arch,
1482 int nr,
1483 unsigned arg_cnt,
1484 const struct scmp_arg_cmp arg) {
1485 int r;
1486
1487 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1488 if (r < 0) {
1489 _cleanup_free_ char *n = NULL;
1490
1491 n = seccomp_syscall_resolve_num_arch(arch, nr);
1492 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1493 strna(n),
1494 seccomp_arch_to_string(arch));
1495 }
1496
1497 return r;
1498 }
1499
1500 /* For known architectures, check that syscalls are indeed defined or not. */
1501 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1502 assert_cc(SCMP_SYS(shmget) > 0);
1503 assert_cc(SCMP_SYS(shmat) > 0);
1504 assert_cc(SCMP_SYS(shmdt) > 0);
1505 #endif
1506
1507 int seccomp_memory_deny_write_execute(void) {
1508 uint32_t arch;
1509 int r;
1510
1511 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1512 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1513 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1514
1515 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1516
1517 switch (arch) {
1518
1519 case SCMP_ARCH_X86:
1520 filter_syscall = SCMP_SYS(mmap2);
1521 block_syscall = SCMP_SYS(mmap);
1522 shmat_syscall = SCMP_SYS(shmat);
1523 break;
1524
1525 case SCMP_ARCH_PPC:
1526 case SCMP_ARCH_PPC64:
1527 case SCMP_ARCH_PPC64LE:
1528 filter_syscall = SCMP_SYS(mmap);
1529
1530 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1531 * We ignore that here, which means there's still a way to get writable/executable
1532 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1533
1534 break;
1535
1536 case SCMP_ARCH_ARM:
1537 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1538 shmat_syscall = SCMP_SYS(shmat);
1539 break;
1540
1541 case SCMP_ARCH_X86_64:
1542 case SCMP_ARCH_X32:
1543 case SCMP_ARCH_AARCH64:
1544 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1545 shmat_syscall = SCMP_SYS(shmat);
1546 break;
1547
1548 /* Please add more definitions here, if you port systemd to other architectures! */
1549
1550 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1551 #warning "Consider adding the right mmap() syscall definitions here!"
1552 #endif
1553 }
1554
1555 /* Can't filter mmap() on this arch, then skip it */
1556 if (filter_syscall == 0)
1557 continue;
1558
1559 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1560 if (r < 0)
1561 return r;
1562
1563 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1564 1,
1565 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1566 if (r < 0)
1567 continue;
1568
1569 if (block_syscall != 0) {
1570 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1571 if (r < 0)
1572 continue;
1573 }
1574
1575 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1576 1,
1577 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1578 if (r < 0)
1579 continue;
1580
1581 #ifdef __NR_pkey_mprotect
1582 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1583 1,
1584 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1585 if (r < 0)
1586 continue;
1587 #endif
1588
1589 if (shmat_syscall > 0) {
1590 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1591 1,
1592 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1593 if (r < 0)
1594 continue;
1595 }
1596
1597 r = seccomp_load(seccomp);
1598 if (IN_SET(r, -EPERM, -EACCES))
1599 return r;
1600 if (r < 0)
1601 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1602 }
1603
1604 return 0;
1605 }
1606
1607 int seccomp_restrict_archs(Set *archs) {
1608 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1609 Iterator i;
1610 void *id;
1611 int r;
1612
1613 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1614 * list.
1615 *
1616 * There are some qualifications. However the most important use is to stop processes from bypassing
1617 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1618 * in a non-native architecture. There are no holes in this use case, at least so far. */
1619
1620 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1621 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1622 * to run a program with the restrictions applied. */
1623 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1624 if (!seccomp)
1625 return -ENOMEM;
1626
1627 SET_FOREACH(id, archs, i) {
1628 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1629 if (r < 0 && r != -EEXIST)
1630 return r;
1631 }
1632
1633 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1634 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1635 * The important thing is that you can block the old 32-bit x86 syscalls.
1636 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1637
1638 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1639 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1640
1641 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1642 if (r < 0 && r != -EEXIST)
1643 return r;
1644 }
1645
1646 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1647 if (r < 0)
1648 return r;
1649
1650 r = seccomp_load(seccomp);
1651 if (IN_SET(r, -EPERM, -EACCES))
1652 return r;
1653 if (r < 0)
1654 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1655
1656 return 0;
1657 }
1658
1659 int parse_syscall_archs(char **l, Set **archs) {
1660 _cleanup_set_free_ Set *_archs;
1661 char **s;
1662 int r;
1663
1664 assert(l);
1665 assert(archs);
1666
1667 r = set_ensure_allocated(&_archs, NULL);
1668 if (r < 0)
1669 return r;
1670
1671 STRV_FOREACH(s, l) {
1672 uint32_t a;
1673
1674 r = seccomp_arch_from_string(*s, &a);
1675 if (r < 0)
1676 return -EINVAL;
1677
1678 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1679 if (r < 0)
1680 return -ENOMEM;
1681 }
1682
1683 *archs = TAKE_PTR(_archs);
1684
1685 return 0;
1686 }
1687
1688 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1689 const char *i;
1690 int r;
1691
1692 assert(set);
1693
1694 NULSTR_FOREACH(i, set->value) {
1695
1696 if (i[0] == '@') {
1697 const SyscallFilterSet *more;
1698
1699 more = syscall_filter_set_find(i);
1700 if (!more)
1701 return -ENXIO;
1702
1703 r = seccomp_filter_set_add(filter, add, more);
1704 if (r < 0)
1705 return r;
1706 } else {
1707 int id;
1708
1709 id = seccomp_syscall_resolve_name(i);
1710 if (id == __NR_SCMP_ERROR) {
1711 log_debug("Couldn't resolve system call, ignoring: %s", i);
1712 continue;
1713 }
1714
1715 if (add) {
1716 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1717 if (r < 0)
1718 return r;
1719 } else
1720 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1721 }
1722 }
1723
1724 return 0;
1725 }
1726
1727 int seccomp_lock_personality(unsigned long personality) {
1728 uint32_t arch;
1729 int r;
1730
1731 if (personality >= PERSONALITY_INVALID)
1732 return -EINVAL;
1733
1734 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1735 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1736
1737 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1738 if (r < 0)
1739 return r;
1740
1741 r = seccomp_rule_add_exact(
1742 seccomp,
1743 SCMP_ACT_ERRNO(EPERM),
1744 SCMP_SYS(personality),
1745 1,
1746 SCMP_A0(SCMP_CMP_NE, personality));
1747 if (r < 0) {
1748 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1749 continue;
1750 }
1751
1752 r = seccomp_load(seccomp);
1753 if (IN_SET(r, -EPERM, -EACCES))
1754 return r;
1755 if (r < 0)
1756 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1757 }
1758
1759 return 0;
1760 }
1761
1762 int seccomp_protect_hostname(void) {
1763 uint32_t arch;
1764 int r;
1765
1766 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1767 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1768
1769 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1770 if (r < 0)
1771 return r;
1772
1773 r = seccomp_rule_add_exact(
1774 seccomp,
1775 SCMP_ACT_ERRNO(EPERM),
1776 SCMP_SYS(sethostname),
1777 0);
1778 if (r < 0)
1779 continue;
1780
1781 r = seccomp_rule_add_exact(
1782 seccomp,
1783 SCMP_ACT_ERRNO(EPERM),
1784 SCMP_SYS(setdomainname),
1785 0);
1786 if (r < 0)
1787 continue;
1788
1789 r = seccomp_load(seccomp);
1790 if (IN_SET(r, -EPERM, -EACCES))
1791 return r;
1792 if (r < 0)
1793 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1794 }
1795
1796 return 0;
1797 }