]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
seccomp: fix multiplexed system calls
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "errno-list.h"
16 #include "macro.h"
17 #include "nsflags.h"
18 #include "nulstr-util.h"
19 #include "process-util.h"
20 #include "seccomp-util.h"
21 #include "set.h"
22 #include "string-util.h"
23 #include "strv.h"
24
25 const uint32_t seccomp_local_archs[] = {
26
27 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
28
29 #if defined(__x86_64__) && defined(__ILP32__)
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
32 SCMP_ARCH_X32, /* native */
33 #elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
35 SCMP_ARCH_X32,
36 SCMP_ARCH_X86_64, /* native */
37 #elif defined(__i386__)
38 SCMP_ARCH_X86,
39 #elif defined(__aarch64__)
40 SCMP_ARCH_ARM,
41 SCMP_ARCH_AARCH64, /* native */
42 #elif defined(__arm__)
43 SCMP_ARCH_ARM,
44 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
48 SCMP_ARCH_MIPS,
49 SCMP_ARCH_MIPSEL, /* native */
50 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
54 SCMP_ARCH_MIPS64N32,
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
67 SCMP_ARCH_MIPSEL64,
68 SCMP_ARCH_MIPS64,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64N32, /* native */
71 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
79 SCMP_ARCH_PPC,
80 SCMP_ARCH_PPC64LE,
81 SCMP_ARCH_PPC64, /* native */
82 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86 #elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88 #elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91 #elif defined(__s390__)
92 SCMP_ARCH_S390,
93 #endif
94 (uint32_t) -1
95 };
96
97 const char* seccomp_arch_to_string(uint32_t c) {
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
102
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
105 return "native";
106 case SCMP_ARCH_X86:
107 return "x86";
108 case SCMP_ARCH_X86_64:
109 return "x86-64";
110 case SCMP_ARCH_X32:
111 return "x32";
112 case SCMP_ARCH_ARM:
113 return "arm";
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
135 return "s390";
136 case SCMP_ARCH_S390X:
137 return "s390x";
138 default:
139 return NULL;
140 }
141 }
142
143 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
183 else
184 return -EINVAL;
185
186 return 0;
187 }
188
189 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
190 scmp_filter_ctx seccomp;
191 int r;
192
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
204 if (r < 0)
205 goto finish;
206
207 r = seccomp_arch_add(seccomp, arch);
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230 finish:
231 seccomp_release(seccomp);
232 return r;
233 }
234
/* Returns true if the PR_GET_SECCOMP prctl() succeeds, i.e. returns a non-negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238
/* Probes for seccomp filter mode by calling prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER) with a NULL
 * filter. Returns true only if that call fails with EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Returns true if both the basic seccomp check and the seccomp filter check succeed. The checks are
 * done on the first call only; the result is remembered in a function-local static and returned as-is
 * on all later calls. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;

        /* Negative means "not determined yet" */
        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
254
255 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
256 [SYSCALL_FILTER_SET_DEFAULT] = {
257 .name = "@default",
258 .help = "System calls that are always permitted",
259 .value =
260 "clock_getres\0"
261 "clock_getres_time64\0"
262 "clock_gettime\0"
263 "clock_gettime64\0"
264 "clock_nanosleep\0"
265 "clock_nanosleep_time64\0"
266 "execve\0"
267 "exit\0"
268 "exit_group\0"
269 "futex\0"
270 "futex_time64\0"
271 "get_robust_list\0"
272 "get_thread_area\0"
273 "getegid\0"
274 "getegid32\0"
275 "geteuid\0"
276 "geteuid32\0"
277 "getgid\0"
278 "getgid32\0"
279 "getgroups\0"
280 "getgroups32\0"
281 "getpgid\0"
282 "getpgrp\0"
283 "getpid\0"
284 "getppid\0"
285 "getresgid\0"
286 "getresgid32\0"
287 "getresuid\0"
288 "getresuid32\0"
289 "getrlimit\0" /* make sure processes can query stack size and such */
290 "getsid\0"
291 "gettid\0"
292 "gettimeofday\0"
293 "getuid\0"
294 "getuid32\0"
295 "membarrier\0"
296 "nanosleep\0"
297 "pause\0"
298 "prlimit64\0"
299 "restart_syscall\0"
300 "rseq\0"
301 "rt_sigreturn\0"
302 "sched_yield\0"
303 "set_robust_list\0"
304 "set_thread_area\0"
305 "set_tid_address\0"
306 "set_tls\0"
307 "sigreturn\0"
308 "time\0"
309 "ugetrlimit\0"
310 },
311 [SYSCALL_FILTER_SET_AIO] = {
312 .name = "@aio",
313 .help = "Asynchronous IO",
314 .value =
315 "io_cancel\0"
316 "io_destroy\0"
317 "io_getevents\0"
318 "io_pgetevents\0"
319 "io_pgetevents_time64\0"
320 "io_setup\0"
321 "io_submit\0"
322 "io_uring_enter\0"
323 "io_uring_register\0"
324 "io_uring_setup\0"
325 },
326 [SYSCALL_FILTER_SET_BASIC_IO] = {
327 .name = "@basic-io",
328 .help = "Basic IO",
329 .value =
330 "_llseek\0"
331 "close\0"
332 "dup\0"
333 "dup2\0"
334 "dup3\0"
335 "lseek\0"
336 "pread64\0"
337 "preadv\0"
338 "preadv2\0"
339 "pwrite64\0"
340 "pwritev\0"
341 "pwritev2\0"
342 "read\0"
343 "readv\0"
344 "write\0"
345 "writev\0"
346 },
347 [SYSCALL_FILTER_SET_CHOWN] = {
348 .name = "@chown",
349 .help = "Change ownership of files and directories",
350 .value =
351 "chown\0"
352 "chown32\0"
353 "fchown\0"
354 "fchown32\0"
355 "fchownat\0"
356 "lchown\0"
357 "lchown32\0"
358 },
359 [SYSCALL_FILTER_SET_CLOCK] = {
360 .name = "@clock",
361 .help = "Change the system time",
362 .value =
363 "adjtimex\0"
364 "clock_adjtime\0"
365 "clock_adjtime64\0"
366 "clock_settime\0"
367 "clock_settime64\0"
368 "settimeofday\0"
369 "stime\0"
370 },
371 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
372 .name = "@cpu-emulation",
373 .help = "System calls for CPU emulation functionality",
374 .value =
375 "modify_ldt\0"
376 "subpage_prot\0"
377 "switch_endian\0"
378 "vm86\0"
379 "vm86old\0"
380 },
381 [SYSCALL_FILTER_SET_DEBUG] = {
382 .name = "@debug",
383 .help = "Debugging, performance monitoring and tracing functionality",
384 .value =
385 "lookup_dcookie\0"
386 "perf_event_open\0"
387 "ptrace\0"
388 "rtas\0"
389 #ifdef __NR_s390_runtime_instr
390 "s390_runtime_instr\0"
391 #endif
392 "sys_debug_setcontext\0"
393 },
394 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
395 .name = "@file-system",
396 .help = "File system operations",
397 .value =
398 "access\0"
399 "chdir\0"
400 "chmod\0"
401 "close\0"
402 "creat\0"
403 "faccessat\0"
404 "fallocate\0"
405 "fchdir\0"
406 "fchmod\0"
407 "fchmodat\0"
408 "fcntl\0"
409 "fcntl64\0"
410 "fgetxattr\0"
411 "flistxattr\0"
412 "fremovexattr\0"
413 "fsetxattr\0"
414 "fstat\0"
415 "fstat64\0"
416 "fstatat64\0"
417 "fstatfs\0"
418 "fstatfs64\0"
419 "ftruncate\0"
420 "ftruncate64\0"
421 "futimesat\0"
422 "getcwd\0"
423 "getdents\0"
424 "getdents64\0"
425 "getxattr\0"
426 "inotify_add_watch\0"
427 "inotify_init\0"
428 "inotify_init1\0"
429 "inotify_rm_watch\0"
430 "lgetxattr\0"
431 "link\0"
432 "linkat\0"
433 "listxattr\0"
434 "llistxattr\0"
435 "lremovexattr\0"
436 "lsetxattr\0"
437 "lstat\0"
438 "lstat64\0"
439 "mkdir\0"
440 "mkdirat\0"
441 "mknod\0"
442 "mknodat\0"
443 "mmap\0"
444 "mmap2\0"
445 "munmap\0"
446 "newfstatat\0"
447 "oldfstat\0"
448 "oldlstat\0"
449 "oldstat\0"
450 "open\0"
451 "openat\0"
452 "readlink\0"
453 "readlinkat\0"
454 "removexattr\0"
455 "rename\0"
456 "renameat\0"
457 "renameat2\0"
458 "rmdir\0"
459 "setxattr\0"
460 "stat\0"
461 "stat64\0"
462 "statfs\0"
463 "statfs64\0"
464 #ifdef __NR_statx
465 "statx\0"
466 #endif
467 "symlink\0"
468 "symlinkat\0"
469 "truncate\0"
470 "truncate64\0"
471 "unlink\0"
472 "unlinkat\0"
473 "utime\0"
474 "utimensat\0"
475 "utimensat_time64\0"
476 "utimes\0"
477 },
478 [SYSCALL_FILTER_SET_IO_EVENT] = {
479 .name = "@io-event",
480 .help = "Event loop system calls",
481 .value =
482 "_newselect\0"
483 "epoll_create\0"
484 "epoll_create1\0"
485 "epoll_ctl\0"
486 "epoll_ctl_old\0"
487 "epoll_pwait\0"
488 "epoll_wait\0"
489 "epoll_wait_old\0"
490 "eventfd\0"
491 "eventfd2\0"
492 "poll\0"
493 "ppoll\0"
494 "ppoll_time64\0"
495 "pselect6\0"
496 "pselect6_time64\0"
497 "select\0"
498 },
499 [SYSCALL_FILTER_SET_IPC] = {
500 .name = "@ipc",
501 .help = "SysV IPC, POSIX Message Queues or other IPC",
502 .value =
503 "ipc\0"
504 "memfd_create\0"
505 "mq_getsetattr\0"
506 "mq_notify\0"
507 "mq_open\0"
508 "mq_timedreceive\0"
509 "mq_timedreceive_time64\0"
510 "mq_timedsend\0"
511 "mq_timedsend_time64\0"
512 "mq_unlink\0"
513 "msgctl\0"
514 "msgget\0"
515 "msgrcv\0"
516 "msgsnd\0"
517 "pipe\0"
518 "pipe2\0"
519 "process_vm_readv\0"
520 "process_vm_writev\0"
521 "semctl\0"
522 "semget\0"
523 "semop\0"
524 "semtimedop\0"
525 "semtimedop_time64\0"
526 "shmat\0"
527 "shmctl\0"
528 "shmdt\0"
529 "shmget\0"
530 },
531 [SYSCALL_FILTER_SET_KEYRING] = {
532 .name = "@keyring",
533 .help = "Kernel keyring access",
534 .value =
535 "add_key\0"
536 "keyctl\0"
537 "request_key\0"
538 },
539 [SYSCALL_FILTER_SET_MEMLOCK] = {
540 .name = "@memlock",
541 .help = "Memory locking control",
542 .value =
543 "mlock\0"
544 "mlock2\0"
545 "mlockall\0"
546 "munlock\0"
547 "munlockall\0"
548 },
549 [SYSCALL_FILTER_SET_MODULE] = {
550 .name = "@module",
551 .help = "Loading and unloading of kernel modules",
552 .value =
553 "delete_module\0"
554 "finit_module\0"
555 "init_module\0"
556 },
557 [SYSCALL_FILTER_SET_MOUNT] = {
558 .name = "@mount",
559 .help = "Mounting and unmounting of file systems",
560 .value =
561 "chroot\0"
562 "fsconfig\0"
563 "fsmount\0"
564 "fsopen\0"
565 "fspick\0"
566 "mount\0"
567 "move_mount\0"
568 "open_tree\0"
569 "pivot_root\0"
570 "umount\0"
571 "umount2\0"
572 },
573 [SYSCALL_FILTER_SET_NETWORK_IO] = {
574 .name = "@network-io",
575 .help = "Network or Unix socket IO, should not be needed if not network facing",
576 .value =
577 "accept\0"
578 "accept4\0"
579 "bind\0"
580 "connect\0"
581 "getpeername\0"
582 "getsockname\0"
583 "getsockopt\0"
584 "listen\0"
585 "recv\0"
586 "recvfrom\0"
587 "recvmmsg\0"
588 "recvmmsg_time64\0"
589 "recvmsg\0"
590 "send\0"
591 "sendmmsg\0"
592 "sendmsg\0"
593 "sendto\0"
594 "setsockopt\0"
595 "shutdown\0"
596 "socket\0"
597 "socketcall\0"
598 "socketpair\0"
599 },
600 [SYSCALL_FILTER_SET_OBSOLETE] = {
601 /* some unknown even to libseccomp */
602 .name = "@obsolete",
603 .help = "Unusual, obsolete or unimplemented system calls",
604 .value =
605 "_sysctl\0"
606 "afs_syscall\0"
607 "bdflush\0"
608 "break\0"
609 "create_module\0"
610 "ftime\0"
611 "get_kernel_syms\0"
612 "getpmsg\0"
613 "gtty\0"
614 "idle\0"
615 "lock\0"
616 "mpx\0"
617 "prof\0"
618 "profil\0"
619 "putpmsg\0"
620 "query_module\0"
621 "security\0"
622 "sgetmask\0"
623 "ssetmask\0"
624 "stty\0"
625 "sysfs\0"
626 "tuxcall\0"
627 "ulimit\0"
628 "uselib\0"
629 "ustat\0"
630 "vserver\0"
631 },
632 [SYSCALL_FILTER_SET_PKEY] = {
633 .name = "@pkey",
634 .help = "System calls used for memory protection keys",
635 .value =
636 "pkey_alloc\0"
637 "pkey_free\0"
638 "pkey_mprotect\0"
639 },
640 [SYSCALL_FILTER_SET_PRIVILEGED] = {
641 .name = "@privileged",
642 .help = "All system calls which need super-user capabilities",
643 .value =
644 "@chown\0"
645 "@clock\0"
646 "@module\0"
647 "@raw-io\0"
648 "@reboot\0"
649 "@swap\0"
650 "_sysctl\0"
651 "acct\0"
652 "bpf\0"
653 "capset\0"
654 "chroot\0"
655 "fanotify_init\0"
656 "fanotify_mark\0"
657 "nfsservctl\0"
658 "open_by_handle_at\0"
659 "pivot_root\0"
660 "quotactl\0"
661 "setdomainname\0"
662 "setfsuid\0"
663 "setfsuid32\0"
664 "setgroups\0"
665 "setgroups32\0"
666 "sethostname\0"
667 "setresuid\0"
668 "setresuid32\0"
669 "setreuid\0"
670 "setreuid32\0"
671 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
672 "setuid32\0"
673 "vhangup\0"
674 },
675 [SYSCALL_FILTER_SET_PROCESS] = {
676 .name = "@process",
677 .help = "Process control, execution, namespaceing operations",
678 .value =
679 "arch_prctl\0"
680 "capget\0" /* Able to query arbitrary processes */
681 "clone\0"
682 "clone3\0"
683 "execveat\0"
684 "fork\0"
685 "getrusage\0"
686 "kill\0"
687 "pidfd_open\0"
688 "pidfd_send_signal\0"
689 "prctl\0"
690 "rt_sigqueueinfo\0"
691 "rt_tgsigqueueinfo\0"
692 "setns\0"
693 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
694 "tgkill\0"
695 "times\0"
696 "tkill\0"
697 "unshare\0"
698 "vfork\0"
699 "wait4\0"
700 "waitid\0"
701 "waitpid\0"
702 },
703 [SYSCALL_FILTER_SET_RAW_IO] = {
704 .name = "@raw-io",
705 .help = "Raw I/O port access",
706 .value =
707 "ioperm\0"
708 "iopl\0"
709 "pciconfig_iobase\0"
710 "pciconfig_read\0"
711 "pciconfig_write\0"
712 #ifdef __NR_s390_pci_mmio_read
713 "s390_pci_mmio_read\0"
714 #endif
715 #ifdef __NR_s390_pci_mmio_write
716 "s390_pci_mmio_write\0"
717 #endif
718 },
719 [SYSCALL_FILTER_SET_REBOOT] = {
720 .name = "@reboot",
721 .help = "Reboot and reboot preparation/kexec",
722 .value =
723 "kexec_file_load\0"
724 "kexec_load\0"
725 "reboot\0"
726 },
727 [SYSCALL_FILTER_SET_RESOURCES] = {
728 .name = "@resources",
729 .help = "Alter resource settings",
730 .value =
731 "ioprio_set\0"
732 "mbind\0"
733 "migrate_pages\0"
734 "move_pages\0"
735 "nice\0"
736 "sched_setaffinity\0"
737 "sched_setattr\0"
738 "sched_setparam\0"
739 "sched_setscheduler\0"
740 "set_mempolicy\0"
741 "setpriority\0"
742 "setrlimit\0"
743 },
744 [SYSCALL_FILTER_SET_SETUID] = {
745 .name = "@setuid",
746 .help = "Operations for changing user/group credentials",
747 .value =
748 "setgid\0"
749 "setgid32\0"
750 "setgroups\0"
751 "setgroups32\0"
752 "setregid\0"
753 "setregid32\0"
754 "setresgid\0"
755 "setresgid32\0"
756 "setresuid\0"
757 "setresuid32\0"
758 "setreuid\0"
759 "setreuid32\0"
760 "setuid\0"
761 "setuid32\0"
762 },
763 [SYSCALL_FILTER_SET_SIGNAL] = {
764 .name = "@signal",
765 .help = "Process signal handling",
766 .value =
767 "rt_sigaction\0"
768 "rt_sigpending\0"
769 "rt_sigprocmask\0"
770 "rt_sigsuspend\0"
771 "rt_sigtimedwait\0"
772 "rt_sigtimedwait_time64\0"
773 "sigaction\0"
774 "sigaltstack\0"
775 "signal\0"
776 "signalfd\0"
777 "signalfd4\0"
778 "sigpending\0"
779 "sigprocmask\0"
780 "sigsuspend\0"
781 },
782 [SYSCALL_FILTER_SET_SWAP] = {
783 .name = "@swap",
784 .help = "Enable/disable swap devices",
785 .value =
786 "swapoff\0"
787 "swapon\0"
788 },
789 [SYSCALL_FILTER_SET_SYNC] = {
790 .name = "@sync",
791 .help = "Synchronize files and memory to storage",
792 .value =
793 "fdatasync\0"
794 "fsync\0"
795 "msync\0"
796 "sync\0"
797 "sync_file_range\0"
798 "sync_file_range2\0"
799 "syncfs\0"
800 },
801 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
802 .name = "@system-service",
803 .help = "General system service operations",
804 .value =
805 "@aio\0"
806 "@basic-io\0"
807 "@chown\0"
808 "@default\0"
809 "@file-system\0"
810 "@io-event\0"
811 "@ipc\0"
812 "@keyring\0"
813 "@memlock\0"
814 "@network-io\0"
815 "@process\0"
816 "@resources\0"
817 "@setuid\0"
818 "@signal\0"
819 "@sync\0"
820 "@timer\0"
821 "brk\0"
822 "capget\0"
823 "capset\0"
824 "copy_file_range\0"
825 "fadvise64\0"
826 "fadvise64_64\0"
827 "flock\0"
828 "get_mempolicy\0"
829 "getcpu\0"
830 "getpriority\0"
831 "getrandom\0"
832 "ioctl\0"
833 "ioprio_get\0"
834 "kcmp\0"
835 "madvise\0"
836 "mprotect\0"
837 "mremap\0"
838 "name_to_handle_at\0"
839 "oldolduname\0"
840 "olduname\0"
841 "personality\0"
842 "readahead\0"
843 "readdir\0"
844 "remap_file_pages\0"
845 "sched_get_priority_max\0"
846 "sched_get_priority_min\0"
847 "sched_getaffinity\0"
848 "sched_getattr\0"
849 "sched_getparam\0"
850 "sched_getscheduler\0"
851 "sched_rr_get_interval\0"
852 "sched_rr_get_interval_time64\0"
853 "sched_yield\0"
854 "sendfile\0"
855 "sendfile64\0"
856 "setfsgid\0"
857 "setfsgid32\0"
858 "setfsuid\0"
859 "setfsuid32\0"
860 "setpgid\0"
861 "setsid\0"
862 "splice\0"
863 "sysinfo\0"
864 "tee\0"
865 "umask\0"
866 "uname\0"
867 "userfaultfd\0"
868 "vmsplice\0"
869 },
870 [SYSCALL_FILTER_SET_TIMER] = {
871 .name = "@timer",
872 .help = "Schedule operations by time",
873 .value =
874 "alarm\0"
875 "getitimer\0"
876 "setitimer\0"
877 "timer_create\0"
878 "timer_delete\0"
879 "timer_getoverrun\0"
880 "timer_gettime\0"
881 "timer_gettime64\0"
882 "timer_settime\0"
883 "timer_settime64\0"
884 "timerfd_create\0"
885 "timerfd_gettime\0"
886 "timerfd_gettime64\0"
887 "timerfd_settime\0"
888 "timerfd_settime64\0"
889 "times\0"
890 },
891 };
892
893 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
894 unsigned i;
895
896 if (isempty(name) || name[0] != '@')
897 return NULL;
898
899 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
900 if (streq(syscall_filter_sets[i].name, name))
901 return syscall_filter_sets + i;
902
903 return NULL;
904 }
905
906 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
907
908 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
909 assert(seccomp);
910 assert(name);
911
912 if (strv_contains(exclude, name))
913 return 0;
914
915 if (name[0] == '@') {
916 const SyscallFilterSet *other;
917
918 other = syscall_filter_set_find(name);
919 if (!other)
920 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
921 "Filter set %s is not known!",
922 name);
923
924 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
925
926 } else {
927 int id, r;
928
929 id = seccomp_syscall_resolve_name(name);
930 if (id == __NR_SCMP_ERROR) {
931 if (log_missing)
932 log_debug("System call %s is not known, ignoring.", name);
933 return 0;
934 }
935
936 r = seccomp_rule_add_exact(seccomp, action, id, 0);
937 if (r < 0) {
938 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
939 bool ignore = r == -EDOM;
940
941 if (!ignore || log_missing)
942 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
943 name, id, ignore ? ", ignoring" : "");
944 if (!ignore)
945 return r;
946 }
947
948 return 0;
949 }
950 }
951
952 static int seccomp_add_syscall_filter_set(
953 scmp_filter_ctx seccomp,
954 const SyscallFilterSet *set,
955 uint32_t action,
956 char **exclude,
957 bool log_missing) {
958
959 const char *sys;
960 int r;
961
962 assert(seccomp);
963 assert(set);
964
965 NULSTR_FOREACH(sys, set->value) {
966 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
967 if (r < 0)
968 return r;
969 }
970
971 return 0;
972 }
973
974 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
975 uint32_t arch;
976 int r;
977
978 assert(set);
979
980 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
981 * each local arch. */
982
983 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
984 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
985
986 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
987
988 r = seccomp_init_for_arch(&seccomp, arch, default_action);
989 if (r < 0)
990 return r;
991
992 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
993 if (r < 0)
994 return log_debug_errno(r, "Failed to add filter set: %m");
995
996 r = seccomp_load(seccomp);
997 if (ERRNO_IS_SECCOMP_FATAL(r))
998 return r;
999 if (r < 0)
1000 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1001 }
1002
1003 return 0;
1004 }
1005
1006 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
1007 uint32_t arch;
1008 int r;
1009
1010 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1011 * SyscallFilterSet* table. */
1012
1013 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
1014 return 0;
1015
1016 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1017 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1018 Iterator i;
1019 void *syscall_id, *val;
1020
1021 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1022
1023 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1024 if (r < 0)
1025 return r;
1026
1027 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
1028 uint32_t a = action;
1029 int id = PTR_TO_INT(syscall_id) - 1;
1030 int error = PTR_TO_INT(val);
1031
1032 if (action != SCMP_ACT_ALLOW && error >= 0)
1033 a = SCMP_ACT_ERRNO(error);
1034
1035 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1036 if (r < 0) {
1037 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1038 _cleanup_free_ char *n = NULL;
1039 bool ignore;
1040
1041 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1042 ignore = r == -EDOM;
1043 if (!ignore || log_missing)
1044 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1045 strna(n), id, ignore ? ", ignoring" : "");
1046 if (!ignore)
1047 return r;
1048 }
1049 }
1050
1051 r = seccomp_load(seccomp);
1052 if (ERRNO_IS_SECCOMP_FATAL(r))
1053 return r;
1054 if (r < 0)
1055 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1056 }
1057
1058 return 0;
1059 }
1060
1061 int seccomp_parse_syscall_filter(
1062 const char *name,
1063 int errno_num,
1064 Hashmap *filter,
1065 SeccompParseFlags flags,
1066 const char *unit,
1067 const char *filename,
1068 unsigned line) {
1069
1070 int r;
1071
1072 assert(name);
1073 assert(filter);
1074
1075 if (name[0] == '@') {
1076 const SyscallFilterSet *set;
1077 const char *i;
1078
1079 set = syscall_filter_set_find(name);
1080 if (!set) {
1081 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1082 return -EINVAL;
1083
1084 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1085 "Unknown system call group, ignoring: %s", name);
1086 return 0;
1087 }
1088
1089 NULSTR_FOREACH(i, set->value) {
1090 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1091 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1092 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1093 * about them. */
1094 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1095 if (r < 0)
1096 return r;
1097 }
1098 } else {
1099 int id;
1100
1101 id = seccomp_syscall_resolve_name(name);
1102 if (id == __NR_SCMP_ERROR) {
1103 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1104 return -EINVAL;
1105
1106 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1107 "Failed to parse system call, ignoring: %s", name);
1108 return 0;
1109 }
1110
1111 /* If we previously wanted to forbid a syscall and now
1112 * we want to allow it, then remove it from the list. */
1113 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1114 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1115 if (r < 0)
1116 switch (r) {
1117 case -ENOMEM:
1118 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1119 case -EEXIST:
1120 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1121 break;
1122 default:
1123 return r;
1124 }
1125 } else
1126 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1127 }
1128
1129 return 0;
1130 }
1131
1132 int seccomp_restrict_namespaces(unsigned long retain) {
1133 uint32_t arch;
1134 int r;
1135
1136 if (DEBUG_LOGGING) {
1137 _cleanup_free_ char *s = NULL;
1138
1139 (void) namespace_flags_to_string(retain, &s);
1140 log_debug("Restricting namespace to: %s.", strna(s));
1141 }
1142
1143 /* NOOP? */
1144 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1145 return 0;
1146
1147 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1148 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1149 unsigned i;
1150
1151 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1152
1153 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1154 if (r < 0)
1155 return r;
1156
1157 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1158 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1159 * altogether. */
1160 r = seccomp_rule_add_exact(
1161 seccomp,
1162 SCMP_ACT_ERRNO(EPERM),
1163 SCMP_SYS(setns),
1164 0);
1165 else
1166 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1167 * special invocation with a zero flags argument, right here. */
1168 r = seccomp_rule_add_exact(
1169 seccomp,
1170 SCMP_ACT_ERRNO(EPERM),
1171 SCMP_SYS(setns),
1172 1,
1173 SCMP_A1(SCMP_CMP_EQ, 0));
1174 if (r < 0) {
1175 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1176 continue;
1177 }
1178
1179 for (i = 0; namespace_flag_map[i].name; i++) {
1180 unsigned long f;
1181
1182 f = namespace_flag_map[i].flag;
1183 if ((retain & f) == f) {
1184 log_debug("Permitting %s.", namespace_flag_map[i].name);
1185 continue;
1186 }
1187
1188 log_debug("Blocking %s.", namespace_flag_map[i].name);
1189
1190 r = seccomp_rule_add_exact(
1191 seccomp,
1192 SCMP_ACT_ERRNO(EPERM),
1193 SCMP_SYS(unshare),
1194 1,
1195 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1196 if (r < 0) {
1197 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1198 break;
1199 }
1200
1201 /* On s390/s390x the first two parameters to clone are switched */
1202 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1203 r = seccomp_rule_add_exact(
1204 seccomp,
1205 SCMP_ACT_ERRNO(EPERM),
1206 SCMP_SYS(clone),
1207 1,
1208 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1209 else
1210 r = seccomp_rule_add_exact(
1211 seccomp,
1212 SCMP_ACT_ERRNO(EPERM),
1213 SCMP_SYS(clone),
1214 1,
1215 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1216 if (r < 0) {
1217 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1218 break;
1219 }
1220
1221 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1222 r = seccomp_rule_add_exact(
1223 seccomp,
1224 SCMP_ACT_ERRNO(EPERM),
1225 SCMP_SYS(setns),
1226 1,
1227 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1228 if (r < 0) {
1229 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1230 break;
1231 }
1232 }
1233 }
1234 if (r < 0)
1235 continue;
1236
1237 r = seccomp_load(seccomp);
1238 if (ERRNO_IS_SECCOMP_FATAL(r))
1239 return r;
1240 if (r < 0)
1241 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1242 }
1243
1244 return 0;
1245 }
1246
1247 int seccomp_protect_sysctl(void) {
1248 uint32_t arch;
1249 int r;
1250
1251 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1252 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1253
1254 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1255
1256 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1257 /* No _sysctl syscall */
1258 continue;
1259
1260 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1261 if (r < 0)
1262 return r;
1263
1264 r = seccomp_rule_add_exact(
1265 seccomp,
1266 SCMP_ACT_ERRNO(EPERM),
1267 SCMP_SYS(_sysctl),
1268 0);
1269 if (r < 0) {
1270 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1271 continue;
1272 }
1273
1274 r = seccomp_load(seccomp);
1275 if (ERRNO_IS_SECCOMP_FATAL(r))
1276 return r;
1277 if (r < 0)
1278 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1279 }
1280
1281 return 0;
1282 }
1283
1284 int seccomp_protect_syslog(void) {
1285 uint32_t arch;
1286 int r;
1287
1288 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1289 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1290
1291 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1292 if (r < 0)
1293 return r;
1294
1295 r = seccomp_rule_add_exact(
1296 seccomp,
1297 SCMP_ACT_ERRNO(EPERM),
1298 SCMP_SYS(syslog),
1299 0);
1300
1301 if (r < 0) {
1302 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1303 continue;
1304 }
1305
1306 r = seccomp_load(seccomp);
1307 if (ERRNO_IS_SECCOMP_FATAL(r))
1308 return r;
1309 if (r < 0)
1310 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1311 }
1312
1313 return 0;
1314 }
1315
1316 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1317 uint32_t arch;
1318 int r;
1319
1320 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1321 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1322 bool supported;
1323 Iterator i;
1324
1325 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1326
1327 switch (arch) {
1328
1329 case SCMP_ARCH_X86_64:
1330 case SCMP_ARCH_X32:
1331 case SCMP_ARCH_ARM:
1332 case SCMP_ARCH_AARCH64:
1333 case SCMP_ARCH_PPC:
1334 case SCMP_ARCH_PPC64:
1335 case SCMP_ARCH_PPC64LE:
1336 case SCMP_ARCH_MIPSEL64N32:
1337 case SCMP_ARCH_MIPS64N32:
1338 case SCMP_ARCH_MIPSEL64:
1339 case SCMP_ARCH_MIPS64:
1340 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1341 supported = true;
1342 break;
1343
1344 case SCMP_ARCH_S390:
1345 case SCMP_ARCH_S390X:
1346 case SCMP_ARCH_X86:
1347 case SCMP_ARCH_MIPSEL:
1348 case SCMP_ARCH_MIPS:
1349 default:
1350 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1351 * don't know */
1352 supported = false;
1353 break;
1354 }
1355
1356 if (!supported)
1357 continue;
1358
1359 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1360 if (r < 0)
1361 return r;
1362
1363 if (whitelist) {
1364 int af, first = 0, last = 0;
1365 void *afp;
1366
1367 /* If this is a whitelist, we first block the address families that are out of range and then
1368 * everything that is not in the set. First, we find the lowest and highest address family in
1369 * the set. */
1370
1371 SET_FOREACH(afp, address_families, i) {
1372 af = PTR_TO_INT(afp);
1373
1374 if (af <= 0 || af >= af_max())
1375 continue;
1376
1377 if (first == 0 || af < first)
1378 first = af;
1379
1380 if (last == 0 || af > last)
1381 last = af;
1382 }
1383
1384 assert((first == 0) == (last == 0));
1385
1386 if (first == 0) {
1387
1388 /* No entries in the valid range, block everything */
1389 r = seccomp_rule_add_exact(
1390 seccomp,
1391 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1392 SCMP_SYS(socket),
1393 0);
1394 if (r < 0) {
1395 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1396 continue;
1397 }
1398
1399 } else {
1400
1401 /* Block everything below the first entry */
1402 r = seccomp_rule_add_exact(
1403 seccomp,
1404 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1405 SCMP_SYS(socket),
1406 1,
1407 SCMP_A0(SCMP_CMP_LT, first));
1408 if (r < 0) {
1409 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1410 continue;
1411 }
1412
1413 /* Block everything above the last entry */
1414 r = seccomp_rule_add_exact(
1415 seccomp,
1416 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1417 SCMP_SYS(socket),
1418 1,
1419 SCMP_A0(SCMP_CMP_GT, last));
1420 if (r < 0) {
1421 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1422 continue;
1423 }
1424
1425 /* Block everything between the first and last entry */
1426 for (af = 1; af < af_max(); af++) {
1427
1428 if (set_contains(address_families, INT_TO_PTR(af)))
1429 continue;
1430
1431 r = seccomp_rule_add_exact(
1432 seccomp,
1433 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1434 SCMP_SYS(socket),
1435 1,
1436 SCMP_A0(SCMP_CMP_EQ, af));
1437 if (r < 0)
1438 break;
1439 }
1440 if (r < 0) {
1441 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1442 continue;
1443 }
1444 }
1445
1446 } else {
1447 void *af;
1448
1449 /* If this is a blacklist, then generate one rule for
1450 * each address family that are then combined in OR
1451 * checks. */
1452
1453 SET_FOREACH(af, address_families, i) {
1454
1455 r = seccomp_rule_add_exact(
1456 seccomp,
1457 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1458 SCMP_SYS(socket),
1459 1,
1460 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1461 if (r < 0)
1462 break;
1463 }
1464 if (r < 0) {
1465 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1466 continue;
1467 }
1468 }
1469
1470 r = seccomp_load(seccomp);
1471 if (ERRNO_IS_SECCOMP_FATAL(r))
1472 return r;
1473 if (r < 0)
1474 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1475 }
1476
1477 return 0;
1478 }
1479
1480 int seccomp_restrict_realtime(void) {
1481 static const int permitted_policies[] = {
1482 SCHED_OTHER,
1483 SCHED_BATCH,
1484 SCHED_IDLE,
1485 };
1486
1487 int r, max_policy = 0;
1488 uint32_t arch;
1489 unsigned i;
1490
1491 /* Determine the highest policy constant we want to allow */
1492 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1493 if (permitted_policies[i] > max_policy)
1494 max_policy = permitted_policies[i];
1495
1496 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1497 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1498 int p;
1499
1500 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1501
1502 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1503 if (r < 0)
1504 return r;
1505
1506 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1507 * whitelist. */
1508 for (p = 0; p < max_policy; p++) {
1509 bool good = false;
1510
1511 /* Check if this is in the whitelist. */
1512 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1513 if (permitted_policies[i] == p) {
1514 good = true;
1515 break;
1516 }
1517
1518 if (good)
1519 continue;
1520
1521 /* Deny this policy */
1522 r = seccomp_rule_add_exact(
1523 seccomp,
1524 SCMP_ACT_ERRNO(EPERM),
1525 SCMP_SYS(sched_setscheduler),
1526 1,
1527 SCMP_A1(SCMP_CMP_EQ, p));
1528 if (r < 0) {
1529 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1530 continue;
1531 }
1532 }
1533
1534 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1535 * unsigned here, hence no need no check for < 0 values. */
1536 r = seccomp_rule_add_exact(
1537 seccomp,
1538 SCMP_ACT_ERRNO(EPERM),
1539 SCMP_SYS(sched_setscheduler),
1540 1,
1541 SCMP_A1(SCMP_CMP_GT, max_policy));
1542 if (r < 0) {
1543 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1544 continue;
1545 }
1546
1547 r = seccomp_load(seccomp);
1548 if (ERRNO_IS_SECCOMP_FATAL(r))
1549 return r;
1550 if (r < 0)
1551 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1552 }
1553
1554 return 0;
1555 }
1556
1557 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1558 uint32_t arch,
1559 int nr,
1560 unsigned arg_cnt,
1561 const struct scmp_arg_cmp arg) {
1562 int r;
1563
1564 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1565 if (r < 0) {
1566 _cleanup_free_ char *n = NULL;
1567
1568 n = seccomp_syscall_resolve_num_arch(arch, nr);
1569 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1570 strna(n),
1571 seccomp_arch_to_string(arch));
1572 }
1573
1574 return r;
1575 }
1576
1577 /* For known architectures, check that syscalls are indeed defined or not. */
1578 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1579 assert_cc(SCMP_SYS(shmget) > 0);
1580 assert_cc(SCMP_SYS(shmat) > 0);
1581 assert_cc(SCMP_SYS(shmdt) > 0);
1582 #endif
1583
1584 int seccomp_memory_deny_write_execute(void) {
1585 uint32_t arch;
1586 int r;
1587
1588 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1589 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1590 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1591
1592 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1593
1594 switch (arch) {
1595
1596 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1597 * We ignore that here, which means there's still a way to get writable/executable
1598 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1599
1600 case SCMP_ARCH_X86:
1601 case SCMP_ARCH_S390:
1602 filter_syscall = SCMP_SYS(mmap2);
1603 block_syscall = SCMP_SYS(mmap);
1604 /* shmat multiplexed, see above */
1605 break;
1606
1607 case SCMP_ARCH_PPC:
1608 case SCMP_ARCH_PPC64:
1609 case SCMP_ARCH_PPC64LE:
1610 case SCMP_ARCH_S390X:
1611 filter_syscall = SCMP_SYS(mmap);
1612 /* shmat multiplexed, see above */
1613 break;
1614
1615 case SCMP_ARCH_ARM:
1616 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1617 shmat_syscall = SCMP_SYS(shmat);
1618 break;
1619
1620 case SCMP_ARCH_X86_64:
1621 case SCMP_ARCH_X32:
1622 case SCMP_ARCH_AARCH64:
1623 filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
1624 shmat_syscall = SCMP_SYS(shmat);
1625 break;
1626
1627 /* Please add more definitions here, if you port systemd to other architectures! */
1628
1629 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
1630 #warning "Consider adding the right mmap() syscall definitions here!"
1631 #endif
1632 }
1633
1634 /* Can't filter mmap() on this arch, then skip it */
1635 if (filter_syscall == 0)
1636 continue;
1637
1638 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1639 if (r < 0)
1640 return r;
1641
1642 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1643 1,
1644 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1645 if (r < 0)
1646 continue;
1647
1648 if (block_syscall != 0) {
1649 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1650 if (r < 0)
1651 continue;
1652 }
1653
1654 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1655 1,
1656 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1657 if (r < 0)
1658 continue;
1659
1660 #ifdef __NR_pkey_mprotect
1661 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1662 1,
1663 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1664 if (r < 0)
1665 continue;
1666 #endif
1667
1668 if (shmat_syscall > 0) {
1669 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1670 1,
1671 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1672 if (r < 0)
1673 continue;
1674 }
1675
1676 r = seccomp_load(seccomp);
1677 if (ERRNO_IS_SECCOMP_FATAL(r))
1678 return r;
1679 if (r < 0)
1680 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1681 }
1682
1683 return 0;
1684 }
1685
1686 int seccomp_restrict_archs(Set *archs) {
1687 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1688 Iterator i;
1689 void *id;
1690 int r;
1691
1692 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1693 * list.
1694 *
1695 * There are some qualifications. However the most important use is to stop processes from bypassing
1696 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1697 * in a non-native architecture. There are no holes in this use case, at least so far. */
1698
1699 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1700 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1701 * to run a program with the restrictions applied. */
1702 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1703 if (!seccomp)
1704 return -ENOMEM;
1705
1706 SET_FOREACH(id, archs, i) {
1707 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1708 if (r < 0 && r != -EEXIST)
1709 return r;
1710 }
1711
1712 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1713 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1714 * The important thing is that you can block the old 32-bit x86 syscalls.
1715 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1716
1717 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1718 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1719
1720 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1721 if (r < 0 && r != -EEXIST)
1722 return r;
1723 }
1724
1725 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1726 if (r < 0)
1727 return r;
1728
1729 r = seccomp_load(seccomp);
1730 if (ERRNO_IS_SECCOMP_FATAL(r))
1731 return r;
1732 if (r < 0)
1733 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1734
1735 return 0;
1736 }
1737
1738 int parse_syscall_archs(char **l, Set **archs) {
1739 _cleanup_set_free_ Set *_archs;
1740 char **s;
1741 int r;
1742
1743 assert(l);
1744 assert(archs);
1745
1746 r = set_ensure_allocated(&_archs, NULL);
1747 if (r < 0)
1748 return r;
1749
1750 STRV_FOREACH(s, l) {
1751 uint32_t a;
1752
1753 r = seccomp_arch_from_string(*s, &a);
1754 if (r < 0)
1755 return -EINVAL;
1756
1757 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1758 if (r < 0)
1759 return -ENOMEM;
1760 }
1761
1762 *archs = TAKE_PTR(_archs);
1763
1764 return 0;
1765 }
1766
1767 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1768 const char *i;
1769 int r;
1770
1771 assert(set);
1772
1773 NULSTR_FOREACH(i, set->value) {
1774
1775 if (i[0] == '@') {
1776 const SyscallFilterSet *more;
1777
1778 more = syscall_filter_set_find(i);
1779 if (!more)
1780 return -ENXIO;
1781
1782 r = seccomp_filter_set_add(filter, add, more);
1783 if (r < 0)
1784 return r;
1785 } else {
1786 int id;
1787
1788 id = seccomp_syscall_resolve_name(i);
1789 if (id == __NR_SCMP_ERROR) {
1790 log_debug("Couldn't resolve system call, ignoring: %s", i);
1791 continue;
1792 }
1793
1794 if (add) {
1795 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1796 if (r < 0)
1797 return r;
1798 } else
1799 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1800 }
1801 }
1802
1803 return 0;
1804 }
1805
1806 int seccomp_lock_personality(unsigned long personality) {
1807 uint32_t arch;
1808 int r;
1809
1810 if (personality >= PERSONALITY_INVALID)
1811 return -EINVAL;
1812
1813 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1814 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1815
1816 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1817 if (r < 0)
1818 return r;
1819
1820 r = seccomp_rule_add_exact(
1821 seccomp,
1822 SCMP_ACT_ERRNO(EPERM),
1823 SCMP_SYS(personality),
1824 1,
1825 SCMP_A0(SCMP_CMP_NE, personality));
1826 if (r < 0) {
1827 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1828 continue;
1829 }
1830
1831 r = seccomp_load(seccomp);
1832 if (ERRNO_IS_SECCOMP_FATAL(r))
1833 return r;
1834 if (r < 0)
1835 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1836 }
1837
1838 return 0;
1839 }
1840
1841 int seccomp_protect_hostname(void) {
1842 uint32_t arch;
1843 int r;
1844
1845 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1846 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1847
1848 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1849 if (r < 0)
1850 return r;
1851
1852 r = seccomp_rule_add_exact(
1853 seccomp,
1854 SCMP_ACT_ERRNO(EPERM),
1855 SCMP_SYS(sethostname),
1856 0);
1857 if (r < 0) {
1858 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1859 continue;
1860 }
1861
1862 r = seccomp_rule_add_exact(
1863 seccomp,
1864 SCMP_ACT_ERRNO(EPERM),
1865 SCMP_SYS(setdomainname),
1866 0);
1867 if (r < 0) {
1868 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1869 continue;
1870 }
1871
1872 r = seccomp_load(seccomp);
1873 if (ERRNO_IS_SECCOMP_FATAL(r))
1874 return r;
1875 if (r < 0)
1876 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1877 }
1878
1879 return 0;
1880 }
1881
1882 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1883 /* Checks the mode_t parameter of the following system calls:
1884 *
1885 * → chmod() + fchmod() + fchmodat()
1886 * → open() + creat() + openat()
1887 * → mkdir() + mkdirat()
1888 * → mknod() + mknodat()
1889 *
1890 * Returns error if *everything* failed, and 0 otherwise.
1891 */
1892 int r = 0;
1893 bool any = false;
1894
1895 r = seccomp_rule_add_exact(
1896 seccomp,
1897 SCMP_ACT_ERRNO(EPERM),
1898 SCMP_SYS(chmod),
1899 1,
1900 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1901 if (r < 0)
1902 log_debug_errno(r, "Failed to add filter for chmod: %m");
1903 else
1904 any = true;
1905
1906 r = seccomp_rule_add_exact(
1907 seccomp,
1908 SCMP_ACT_ERRNO(EPERM),
1909 SCMP_SYS(fchmod),
1910 1,
1911 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1912 if (r < 0)
1913 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1914 else
1915 any = true;
1916
1917 r = seccomp_rule_add_exact(
1918 seccomp,
1919 SCMP_ACT_ERRNO(EPERM),
1920 SCMP_SYS(fchmodat),
1921 1,
1922 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1923 if (r < 0)
1924 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1925 else
1926 any = true;
1927
1928 r = seccomp_rule_add_exact(
1929 seccomp,
1930 SCMP_ACT_ERRNO(EPERM),
1931 SCMP_SYS(mkdir),
1932 1,
1933 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1934 if (r < 0)
1935 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1936 else
1937 any = true;
1938
1939 r = seccomp_rule_add_exact(
1940 seccomp,
1941 SCMP_ACT_ERRNO(EPERM),
1942 SCMP_SYS(mkdirat),
1943 1,
1944 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1945 if (r < 0)
1946 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1947 else
1948 any = true;
1949
1950 r = seccomp_rule_add_exact(
1951 seccomp,
1952 SCMP_ACT_ERRNO(EPERM),
1953 SCMP_SYS(mknod),
1954 1,
1955 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1956 if (r < 0)
1957 log_debug_errno(r, "Failed to add filter for mknod: %m");
1958 else
1959 any = true;
1960
1961 r = seccomp_rule_add_exact(
1962 seccomp,
1963 SCMP_ACT_ERRNO(EPERM),
1964 SCMP_SYS(mknodat),
1965 1,
1966 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1967 if (r < 0)
1968 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1969 else
1970 any = true;
1971
1972 #if SCMP_SYS(open) > 0
1973 r = seccomp_rule_add_exact(
1974 seccomp,
1975 SCMP_ACT_ERRNO(EPERM),
1976 SCMP_SYS(open),
1977 2,
1978 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1979 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1980 if (r < 0)
1981 log_debug_errno(r, "Failed to add filter for open: %m");
1982 else
1983 any = true;
1984 #endif
1985
1986 r = seccomp_rule_add_exact(
1987 seccomp,
1988 SCMP_ACT_ERRNO(EPERM),
1989 SCMP_SYS(openat),
1990 2,
1991 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1992 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1993 if (r < 0)
1994 log_debug_errno(r, "Failed to add filter for openat: %m");
1995 else
1996 any = true;
1997
1998 r = seccomp_rule_add_exact(
1999 seccomp,
2000 SCMP_ACT_ERRNO(EPERM),
2001 SCMP_SYS(creat),
2002 1,
2003 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2004 if (r < 0)
2005 log_debug_errno(r, "Failed to add filter for creat: %m");
2006 else
2007 any = true;
2008
2009 return any ? 0 : r;
2010 }
2011
2012 int seccomp_restrict_suid_sgid(void) {
2013 uint32_t arch;
2014 int r, k;
2015
2016 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2017 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2018
2019 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2020 if (r < 0)
2021 return r;
2022
2023 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2024 if (r < 0)
2025 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2026
2027 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2028 if (k < 0)
2029 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2030
2031 if (r < 0 && k < 0)
2032 continue;
2033
2034 r = seccomp_load(seccomp);
2035 if (ERRNO_IS_SECCOMP_FATAL(r))
2036 return r;
2037 if (r < 0)
2038 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2039 }
2040
2041 return 0;
2042 }
2043
2044 uint32_t scmp_act_kill_process(void) {
2045
2046 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2047 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2048 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2049 * for single-threaded apps does the right thing. */
2050
2051 #ifdef SCMP_ACT_KILL_PROCESS
2052 if (seccomp_api_get() >= 3)
2053 return SCMP_ACT_KILL_PROCESS;
2054 #endif
2055
2056 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2057 }