]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #15768 from poettering/grnd-insecure
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "errno-list.h"
16 #include "macro.h"
17 #include "nsflags.h"
18 #include "nulstr-util.h"
19 #include "process-util.h"
20 #include "seccomp-util.h"
21 #include "set.h"
22 #include "string-util.h"
23 #include "strv.h"
24
25 const uint32_t seccomp_local_archs[] = {
26
27 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
28
29 #if defined(__x86_64__) && defined(__ILP32__)
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
32 SCMP_ARCH_X32, /* native */
33 #elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
35 SCMP_ARCH_X32,
36 SCMP_ARCH_X86_64, /* native */
37 #elif defined(__i386__)
38 SCMP_ARCH_X86,
39 #elif defined(__aarch64__)
40 SCMP_ARCH_ARM,
41 SCMP_ARCH_AARCH64, /* native */
42 #elif defined(__arm__)
43 SCMP_ARCH_ARM,
44 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
48 SCMP_ARCH_MIPS,
49 SCMP_ARCH_MIPSEL, /* native */
50 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
54 SCMP_ARCH_MIPS64N32,
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
67 SCMP_ARCH_MIPSEL64,
68 SCMP_ARCH_MIPS64,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64N32, /* native */
71 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
79 SCMP_ARCH_PPC,
80 SCMP_ARCH_PPC64LE,
81 SCMP_ARCH_PPC64, /* native */
82 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86 #elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88 #elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91 #elif defined(__s390__)
92 SCMP_ARCH_S390,
93 #endif
94 (uint32_t) -1
95 };
96
97 const char* seccomp_arch_to_string(uint32_t c) {
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
102
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
105 return "native";
106 case SCMP_ARCH_X86:
107 return "x86";
108 case SCMP_ARCH_X86_64:
109 return "x86-64";
110 case SCMP_ARCH_X32:
111 return "x32";
112 case SCMP_ARCH_ARM:
113 return "arm";
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
135 return "s390";
136 case SCMP_ARCH_S390X:
137 return "s390x";
138 default:
139 return NULL;
140 }
141 }
142
143 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
183 else
184 return -EINVAL;
185
186 return 0;
187 }
188
189 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
190 scmp_filter_ctx seccomp;
191 int r;
192
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
204 if (r < 0)
205 goto finish;
206
207 r = seccomp_arch_add(seccomp, arch);
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230 finish:
231 seccomp_release(seccomp);
232 return r;
233 }
234
/* Returns true if querying the seccomp mode via prctl(PR_GET_SECCOMP) succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238
/* Probes for seccomp filter mode by asking prctl() to install a NULL filter: the result is true only if
 * the call fails and errno is EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Returns true if both the basic seccomp probe and the seccomp filter probe succeed. The probes run on
 * the first call only; the result is kept in a function-local static and returned on later calls. */
bool is_seccomp_available(void) {
        static int probe_result = -1; /* -1: not probed yet, otherwise the boolean result */

        if (probe_result >= 0)
                return probe_result;

        probe_result = is_basic_seccomp_available() && is_seccomp_filter_available();
        return probe_result;
}
254
255 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
256 [SYSCALL_FILTER_SET_DEFAULT] = {
257 .name = "@default",
258 .help = "System calls that are always permitted",
259 .value =
260 "clock_getres\0"
261 "clock_getres_time64\0"
262 "clock_gettime\0"
263 "clock_gettime64\0"
264 "clock_nanosleep\0"
265 "clock_nanosleep_time64\0"
266 "execve\0"
267 "exit\0"
268 "exit_group\0"
269 "futex\0"
270 "futex_time64\0"
271 "get_robust_list\0"
272 "get_thread_area\0"
273 "getegid\0"
274 "getegid32\0"
275 "geteuid\0"
276 "geteuid32\0"
277 "getgid\0"
278 "getgid32\0"
279 "getgroups\0"
280 "getgroups32\0"
281 "getpgid\0"
282 "getpgrp\0"
283 "getpid\0"
284 "getppid\0"
285 "getresgid\0"
286 "getresgid32\0"
287 "getresuid\0"
288 "getresuid32\0"
289 "getrlimit\0" /* make sure processes can query stack size and such */
290 "getsid\0"
291 "gettid\0"
292 "gettimeofday\0"
293 "getuid\0"
294 "getuid32\0"
295 "membarrier\0"
296 "nanosleep\0"
297 "pause\0"
298 "prlimit64\0"
299 "restart_syscall\0"
300 "rseq\0"
301 "rt_sigreturn\0"
302 "sched_yield\0"
303 "set_robust_list\0"
304 "set_thread_area\0"
305 "set_tid_address\0"
306 "set_tls\0"
307 "sigreturn\0"
308 "time\0"
309 "ugetrlimit\0"
310 },
311 [SYSCALL_FILTER_SET_AIO] = {
312 .name = "@aio",
313 .help = "Asynchronous IO",
314 .value =
315 "io_cancel\0"
316 "io_destroy\0"
317 "io_getevents\0"
318 "io_pgetevents\0"
319 "io_pgetevents_time64\0"
320 "io_setup\0"
321 "io_submit\0"
322 "io_uring_enter\0"
323 "io_uring_register\0"
324 "io_uring_setup\0"
325 },
326 [SYSCALL_FILTER_SET_BASIC_IO] = {
327 .name = "@basic-io",
328 .help = "Basic IO",
329 .value =
330 "_llseek\0"
331 "close\0"
332 "dup\0"
333 "dup2\0"
334 "dup3\0"
335 "lseek\0"
336 "pread64\0"
337 "preadv\0"
338 "preadv2\0"
339 "pwrite64\0"
340 "pwritev\0"
341 "pwritev2\0"
342 "read\0"
343 "readv\0"
344 "write\0"
345 "writev\0"
346 },
347 [SYSCALL_FILTER_SET_CHOWN] = {
348 .name = "@chown",
349 .help = "Change ownership of files and directories",
350 .value =
351 "chown\0"
352 "chown32\0"
353 "fchown\0"
354 "fchown32\0"
355 "fchownat\0"
356 "lchown\0"
357 "lchown32\0"
358 },
359 [SYSCALL_FILTER_SET_CLOCK] = {
360 .name = "@clock",
361 .help = "Change the system time",
362 .value =
363 "adjtimex\0"
364 "clock_adjtime\0"
365 "clock_adjtime64\0"
366 "clock_settime\0"
367 "clock_settime64\0"
368 "settimeofday\0"
369 "stime\0"
370 },
371 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
372 .name = "@cpu-emulation",
373 .help = "System calls for CPU emulation functionality",
374 .value =
375 "modify_ldt\0"
376 "subpage_prot\0"
377 "switch_endian\0"
378 "vm86\0"
379 "vm86old\0"
380 },
381 [SYSCALL_FILTER_SET_DEBUG] = {
382 .name = "@debug",
383 .help = "Debugging, performance monitoring and tracing functionality",
384 .value =
385 "lookup_dcookie\0"
386 "perf_event_open\0"
387 "pidfd_getfd\0"
388 "ptrace\0"
389 "rtas\0"
390 #ifdef __NR_s390_runtime_instr
391 "s390_runtime_instr\0"
392 #endif
393 "sys_debug_setcontext\0"
394 },
395 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
396 .name = "@file-system",
397 .help = "File system operations",
398 .value =
399 "access\0"
400 "chdir\0"
401 "chmod\0"
402 "close\0"
403 "creat\0"
404 "faccessat\0"
405 "fallocate\0"
406 "fchdir\0"
407 "fchmod\0"
408 "fchmodat\0"
409 "fcntl\0"
410 "fcntl64\0"
411 "fgetxattr\0"
412 "flistxattr\0"
413 "fremovexattr\0"
414 "fsetxattr\0"
415 "fstat\0"
416 "fstat64\0"
417 "fstatat64\0"
418 "fstatfs\0"
419 "fstatfs64\0"
420 "ftruncate\0"
421 "ftruncate64\0"
422 "futimesat\0"
423 "getcwd\0"
424 "getdents\0"
425 "getdents64\0"
426 "getxattr\0"
427 "inotify_add_watch\0"
428 "inotify_init\0"
429 "inotify_init1\0"
430 "inotify_rm_watch\0"
431 "lgetxattr\0"
432 "link\0"
433 "linkat\0"
434 "listxattr\0"
435 "llistxattr\0"
436 "lremovexattr\0"
437 "lsetxattr\0"
438 "lstat\0"
439 "lstat64\0"
440 "mkdir\0"
441 "mkdirat\0"
442 "mknod\0"
443 "mknodat\0"
444 "mmap\0"
445 "mmap2\0"
446 "munmap\0"
447 "newfstatat\0"
448 "oldfstat\0"
449 "oldlstat\0"
450 "oldstat\0"
451 "open\0"
452 "openat\0"
453 "openat2\0"
454 "readlink\0"
455 "readlinkat\0"
456 "removexattr\0"
457 "rename\0"
458 "renameat\0"
459 "renameat2\0"
460 "rmdir\0"
461 "setxattr\0"
462 "stat\0"
463 "stat64\0"
464 "statfs\0"
465 "statfs64\0"
466 #ifdef __NR_statx
467 "statx\0"
468 #endif
469 "symlink\0"
470 "symlinkat\0"
471 "truncate\0"
472 "truncate64\0"
473 "unlink\0"
474 "unlinkat\0"
475 "utime\0"
476 "utimensat\0"
477 "utimensat_time64\0"
478 "utimes\0"
479 },
480 [SYSCALL_FILTER_SET_IO_EVENT] = {
481 .name = "@io-event",
482 .help = "Event loop system calls",
483 .value =
484 "_newselect\0"
485 "epoll_create\0"
486 "epoll_create1\0"
487 "epoll_ctl\0"
488 "epoll_ctl_old\0"
489 "epoll_pwait\0"
490 "epoll_wait\0"
491 "epoll_wait_old\0"
492 "eventfd\0"
493 "eventfd2\0"
494 "poll\0"
495 "ppoll\0"
496 "ppoll_time64\0"
497 "pselect6\0"
498 "pselect6_time64\0"
499 "select\0"
500 },
501 [SYSCALL_FILTER_SET_IPC] = {
502 .name = "@ipc",
503 .help = "SysV IPC, POSIX Message Queues or other IPC",
504 .value =
505 "ipc\0"
506 "memfd_create\0"
507 "mq_getsetattr\0"
508 "mq_notify\0"
509 "mq_open\0"
510 "mq_timedreceive\0"
511 "mq_timedreceive_time64\0"
512 "mq_timedsend\0"
513 "mq_timedsend_time64\0"
514 "mq_unlink\0"
515 "msgctl\0"
516 "msgget\0"
517 "msgrcv\0"
518 "msgsnd\0"
519 "pipe\0"
520 "pipe2\0"
521 "process_vm_readv\0"
522 "process_vm_writev\0"
523 "semctl\0"
524 "semget\0"
525 "semop\0"
526 "semtimedop\0"
527 "semtimedop_time64\0"
528 "shmat\0"
529 "shmctl\0"
530 "shmdt\0"
531 "shmget\0"
532 },
533 [SYSCALL_FILTER_SET_KEYRING] = {
534 .name = "@keyring",
535 .help = "Kernel keyring access",
536 .value =
537 "add_key\0"
538 "keyctl\0"
539 "request_key\0"
540 },
541 [SYSCALL_FILTER_SET_MEMLOCK] = {
542 .name = "@memlock",
543 .help = "Memory locking control",
544 .value =
545 "mlock\0"
546 "mlock2\0"
547 "mlockall\0"
548 "munlock\0"
549 "munlockall\0"
550 },
551 [SYSCALL_FILTER_SET_MODULE] = {
552 .name = "@module",
553 .help = "Loading and unloading of kernel modules",
554 .value =
555 "delete_module\0"
556 "finit_module\0"
557 "init_module\0"
558 },
559 [SYSCALL_FILTER_SET_MOUNT] = {
560 .name = "@mount",
561 .help = "Mounting and unmounting of file systems",
562 .value =
563 "chroot\0"
564 "fsconfig\0"
565 "fsmount\0"
566 "fsopen\0"
567 "fspick\0"
568 "mount\0"
569 "move_mount\0"
570 "open_tree\0"
571 "pivot_root\0"
572 "umount\0"
573 "umount2\0"
574 },
575 [SYSCALL_FILTER_SET_NETWORK_IO] = {
576 .name = "@network-io",
577 .help = "Network or Unix socket IO, should not be needed if not network facing",
578 .value =
579 "accept\0"
580 "accept4\0"
581 "bind\0"
582 "connect\0"
583 "getpeername\0"
584 "getsockname\0"
585 "getsockopt\0"
586 "listen\0"
587 "recv\0"
588 "recvfrom\0"
589 "recvmmsg\0"
590 "recvmmsg_time64\0"
591 "recvmsg\0"
592 "send\0"
593 "sendmmsg\0"
594 "sendmsg\0"
595 "sendto\0"
596 "setsockopt\0"
597 "shutdown\0"
598 "socket\0"
599 "socketcall\0"
600 "socketpair\0"
601 },
602 [SYSCALL_FILTER_SET_OBSOLETE] = {
603 /* some unknown even to libseccomp */
604 .name = "@obsolete",
605 .help = "Unusual, obsolete or unimplemented system calls",
606 .value =
607 "_sysctl\0"
608 "afs_syscall\0"
609 "bdflush\0"
610 "break\0"
611 "create_module\0"
612 "ftime\0"
613 "get_kernel_syms\0"
614 "getpmsg\0"
615 "gtty\0"
616 "idle\0"
617 "lock\0"
618 "mpx\0"
619 "prof\0"
620 "profil\0"
621 "putpmsg\0"
622 "query_module\0"
623 "security\0"
624 "sgetmask\0"
625 "ssetmask\0"
626 "stty\0"
627 "sysfs\0"
628 "tuxcall\0"
629 "ulimit\0"
630 "uselib\0"
631 "ustat\0"
632 "vserver\0"
633 },
634 [SYSCALL_FILTER_SET_PKEY] = {
635 .name = "@pkey",
636 .help = "System calls used for memory protection keys",
637 .value =
638 "pkey_alloc\0"
639 "pkey_free\0"
640 "pkey_mprotect\0"
641 },
642 [SYSCALL_FILTER_SET_PRIVILEGED] = {
643 .name = "@privileged",
644 .help = "All system calls which need super-user capabilities",
645 .value =
646 "@chown\0"
647 "@clock\0"
648 "@module\0"
649 "@raw-io\0"
650 "@reboot\0"
651 "@swap\0"
652 "_sysctl\0"
653 "acct\0"
654 "bpf\0"
655 "capset\0"
656 "chroot\0"
657 "fanotify_init\0"
658 "fanotify_mark\0"
659 "nfsservctl\0"
660 "open_by_handle_at\0"
661 "pivot_root\0"
662 "quotactl\0"
663 "setdomainname\0"
664 "setfsuid\0"
665 "setfsuid32\0"
666 "setgroups\0"
667 "setgroups32\0"
668 "sethostname\0"
669 "setresuid\0"
670 "setresuid32\0"
671 "setreuid\0"
672 "setreuid32\0"
673 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
674 "setuid32\0"
675 "vhangup\0"
676 },
677 [SYSCALL_FILTER_SET_PROCESS] = {
678 .name = "@process",
679 .help = "Process control, execution, namespaceing operations",
680 .value =
681 "arch_prctl\0"
682 "capget\0" /* Able to query arbitrary processes */
683 "clone\0"
684 "clone3\0"
685 "execveat\0"
686 "fork\0"
687 "getrusage\0"
688 "kill\0"
689 "pidfd_open\0"
690 "pidfd_send_signal\0"
691 "prctl\0"
692 "rt_sigqueueinfo\0"
693 "rt_tgsigqueueinfo\0"
694 "setns\0"
695 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
696 "tgkill\0"
697 "times\0"
698 "tkill\0"
699 "unshare\0"
700 "vfork\0"
701 "wait4\0"
702 "waitid\0"
703 "waitpid\0"
704 },
705 [SYSCALL_FILTER_SET_RAW_IO] = {
706 .name = "@raw-io",
707 .help = "Raw I/O port access",
708 .value =
709 "ioperm\0"
710 "iopl\0"
711 "pciconfig_iobase\0"
712 "pciconfig_read\0"
713 "pciconfig_write\0"
714 #ifdef __NR_s390_pci_mmio_read
715 "s390_pci_mmio_read\0"
716 #endif
717 #ifdef __NR_s390_pci_mmio_write
718 "s390_pci_mmio_write\0"
719 #endif
720 },
721 [SYSCALL_FILTER_SET_REBOOT] = {
722 .name = "@reboot",
723 .help = "Reboot and reboot preparation/kexec",
724 .value =
725 "kexec_file_load\0"
726 "kexec_load\0"
727 "reboot\0"
728 },
729 [SYSCALL_FILTER_SET_RESOURCES] = {
730 .name = "@resources",
731 .help = "Alter resource settings",
732 .value =
733 "ioprio_set\0"
734 "mbind\0"
735 "migrate_pages\0"
736 "move_pages\0"
737 "nice\0"
738 "sched_setaffinity\0"
739 "sched_setattr\0"
740 "sched_setparam\0"
741 "sched_setscheduler\0"
742 "set_mempolicy\0"
743 "setpriority\0"
744 "setrlimit\0"
745 },
746 [SYSCALL_FILTER_SET_SETUID] = {
747 .name = "@setuid",
748 .help = "Operations for changing user/group credentials",
749 .value =
750 "setgid\0"
751 "setgid32\0"
752 "setgroups\0"
753 "setgroups32\0"
754 "setregid\0"
755 "setregid32\0"
756 "setresgid\0"
757 "setresgid32\0"
758 "setresuid\0"
759 "setresuid32\0"
760 "setreuid\0"
761 "setreuid32\0"
762 "setuid\0"
763 "setuid32\0"
764 },
765 [SYSCALL_FILTER_SET_SIGNAL] = {
766 .name = "@signal",
767 .help = "Process signal handling",
768 .value =
769 "rt_sigaction\0"
770 "rt_sigpending\0"
771 "rt_sigprocmask\0"
772 "rt_sigsuspend\0"
773 "rt_sigtimedwait\0"
774 "rt_sigtimedwait_time64\0"
775 "sigaction\0"
776 "sigaltstack\0"
777 "signal\0"
778 "signalfd\0"
779 "signalfd4\0"
780 "sigpending\0"
781 "sigprocmask\0"
782 "sigsuspend\0"
783 },
784 [SYSCALL_FILTER_SET_SWAP] = {
785 .name = "@swap",
786 .help = "Enable/disable swap devices",
787 .value =
788 "swapoff\0"
789 "swapon\0"
790 },
791 [SYSCALL_FILTER_SET_SYNC] = {
792 .name = "@sync",
793 .help = "Synchronize files and memory to storage",
794 .value =
795 "fdatasync\0"
796 "fsync\0"
797 "msync\0"
798 "sync\0"
799 "sync_file_range\0"
800 "sync_file_range2\0"
801 "syncfs\0"
802 },
803 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
804 .name = "@system-service",
805 .help = "General system service operations",
806 .value =
807 "@aio\0"
808 "@basic-io\0"
809 "@chown\0"
810 "@default\0"
811 "@file-system\0"
812 "@io-event\0"
813 "@ipc\0"
814 "@keyring\0"
815 "@memlock\0"
816 "@network-io\0"
817 "@process\0"
818 "@resources\0"
819 "@setuid\0"
820 "@signal\0"
821 "@sync\0"
822 "@timer\0"
823 "brk\0"
824 "capget\0"
825 "capset\0"
826 "copy_file_range\0"
827 "fadvise64\0"
828 "fadvise64_64\0"
829 "flock\0"
830 "get_mempolicy\0"
831 "getcpu\0"
832 "getpriority\0"
833 "getrandom\0"
834 "ioctl\0"
835 "ioprio_get\0"
836 "kcmp\0"
837 "madvise\0"
838 "mprotect\0"
839 "mremap\0"
840 "name_to_handle_at\0"
841 "oldolduname\0"
842 "olduname\0"
843 "personality\0"
844 "readahead\0"
845 "readdir\0"
846 "remap_file_pages\0"
847 "sched_get_priority_max\0"
848 "sched_get_priority_min\0"
849 "sched_getaffinity\0"
850 "sched_getattr\0"
851 "sched_getparam\0"
852 "sched_getscheduler\0"
853 "sched_rr_get_interval\0"
854 "sched_rr_get_interval_time64\0"
855 "sched_yield\0"
856 "sendfile\0"
857 "sendfile64\0"
858 "setfsgid\0"
859 "setfsgid32\0"
860 "setfsuid\0"
861 "setfsuid32\0"
862 "setpgid\0"
863 "setsid\0"
864 "splice\0"
865 "sysinfo\0"
866 "tee\0"
867 "umask\0"
868 "uname\0"
869 "userfaultfd\0"
870 "vmsplice\0"
871 },
872 [SYSCALL_FILTER_SET_TIMER] = {
873 .name = "@timer",
874 .help = "Schedule operations by time",
875 .value =
876 "alarm\0"
877 "getitimer\0"
878 "setitimer\0"
879 "timer_create\0"
880 "timer_delete\0"
881 "timer_getoverrun\0"
882 "timer_gettime\0"
883 "timer_gettime64\0"
884 "timer_settime\0"
885 "timer_settime64\0"
886 "timerfd_create\0"
887 "timerfd_gettime\0"
888 "timerfd_gettime64\0"
889 "timerfd_settime\0"
890 "timerfd_settime64\0"
891 "times\0"
892 },
893 };
894
895 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
896 unsigned i;
897
898 if (isempty(name) || name[0] != '@')
899 return NULL;
900
901 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
902 if (streq(syscall_filter_sets[i].name, name))
903 return syscall_filter_sets + i;
904
905 return NULL;
906 }
907
908 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
909
910 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
911 assert(seccomp);
912 assert(name);
913
914 if (strv_contains(exclude, name))
915 return 0;
916
917 if (name[0] == '@') {
918 const SyscallFilterSet *other;
919
920 other = syscall_filter_set_find(name);
921 if (!other)
922 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
923 "Filter set %s is not known!",
924 name);
925
926 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
927
928 } else {
929 int id, r;
930
931 id = seccomp_syscall_resolve_name(name);
932 if (id == __NR_SCMP_ERROR) {
933 if (log_missing)
934 log_debug("System call %s is not known, ignoring.", name);
935 return 0;
936 }
937
938 r = seccomp_rule_add_exact(seccomp, action, id, 0);
939 if (r < 0) {
940 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
941 bool ignore = r == -EDOM;
942
943 if (!ignore || log_missing)
944 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
945 name, id, ignore ? ", ignoring" : "");
946 if (!ignore)
947 return r;
948 }
949
950 return 0;
951 }
952 }
953
954 static int seccomp_add_syscall_filter_set(
955 scmp_filter_ctx seccomp,
956 const SyscallFilterSet *set,
957 uint32_t action,
958 char **exclude,
959 bool log_missing) {
960
961 const char *sys;
962 int r;
963
964 assert(seccomp);
965 assert(set);
966
967 NULSTR_FOREACH(sys, set->value) {
968 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
969 if (r < 0)
970 return r;
971 }
972
973 return 0;
974 }
975
976 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
977 uint32_t arch;
978 int r;
979
980 assert(set);
981
982 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
983 * each local arch. */
984
985 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
986 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
987
988 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
989
990 r = seccomp_init_for_arch(&seccomp, arch, default_action);
991 if (r < 0)
992 return r;
993
994 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
995 if (r < 0)
996 return log_debug_errno(r, "Failed to add filter set: %m");
997
998 r = seccomp_load(seccomp);
999 if (ERRNO_IS_SECCOMP_FATAL(r))
1000 return r;
1001 if (r < 0)
1002 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1003 }
1004
1005 return 0;
1006 }
1007
1008 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
1009 uint32_t arch;
1010 int r;
1011
1012 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1013 * SyscallFilterSet* table. */
1014
1015 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
1016 return 0;
1017
1018 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1019 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1020 Iterator i;
1021 void *syscall_id, *val;
1022
1023 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1024
1025 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1026 if (r < 0)
1027 return r;
1028
1029 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
1030 uint32_t a = action;
1031 int id = PTR_TO_INT(syscall_id) - 1;
1032 int error = PTR_TO_INT(val);
1033
1034 if (action != SCMP_ACT_ALLOW && error >= 0)
1035 a = SCMP_ACT_ERRNO(error);
1036
1037 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1038 if (r < 0) {
1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1040 _cleanup_free_ char *n = NULL;
1041 bool ignore;
1042
1043 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1044 ignore = r == -EDOM;
1045 if (!ignore || log_missing)
1046 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1047 strna(n), id, ignore ? ", ignoring" : "");
1048 if (!ignore)
1049 return r;
1050 }
1051 }
1052
1053 r = seccomp_load(seccomp);
1054 if (ERRNO_IS_SECCOMP_FATAL(r))
1055 return r;
1056 if (r < 0)
1057 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 }
1059
1060 return 0;
1061 }
1062
1063 int seccomp_parse_syscall_filter(
1064 const char *name,
1065 int errno_num,
1066 Hashmap *filter,
1067 SeccompParseFlags flags,
1068 const char *unit,
1069 const char *filename,
1070 unsigned line) {
1071
1072 int r;
1073
1074 assert(name);
1075 assert(filter);
1076
1077 if (name[0] == '@') {
1078 const SyscallFilterSet *set;
1079 const char *i;
1080
1081 set = syscall_filter_set_find(name);
1082 if (!set) {
1083 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1084 return -EINVAL;
1085
1086 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1087 "Unknown system call group, ignoring: %s", name);
1088 return 0;
1089 }
1090
1091 NULSTR_FOREACH(i, set->value) {
1092 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1093 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1094 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1095 * about them. */
1096 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1097 if (r < 0)
1098 return r;
1099 }
1100 } else {
1101 int id;
1102
1103 id = seccomp_syscall_resolve_name(name);
1104 if (id == __NR_SCMP_ERROR) {
1105 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1106 return -EINVAL;
1107
1108 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1109 "Failed to parse system call, ignoring: %s", name);
1110 return 0;
1111 }
1112
1113 /* If we previously wanted to forbid a syscall and now
1114 * we want to allow it, then remove it from the list. */
1115 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1116 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1117 if (r < 0)
1118 switch (r) {
1119 case -ENOMEM:
1120 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1121 case -EEXIST:
1122 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1123 break;
1124 default:
1125 return r;
1126 }
1127 } else
1128 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1129 }
1130
1131 return 0;
1132 }
1133
1134 int seccomp_restrict_namespaces(unsigned long retain) {
1135 uint32_t arch;
1136 int r;
1137
1138 if (DEBUG_LOGGING) {
1139 _cleanup_free_ char *s = NULL;
1140
1141 (void) namespace_flags_to_string(retain, &s);
1142 log_debug("Restricting namespace to: %s.", strna(s));
1143 }
1144
1145 /* NOOP? */
1146 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1147 return 0;
1148
1149 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1150 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1151 unsigned i;
1152
1153 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1154
1155 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1156 if (r < 0)
1157 return r;
1158
1159 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1160 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1161 * altogether. */
1162 r = seccomp_rule_add_exact(
1163 seccomp,
1164 SCMP_ACT_ERRNO(EPERM),
1165 SCMP_SYS(setns),
1166 0);
1167 else
1168 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1169 * special invocation with a zero flags argument, right here. */
1170 r = seccomp_rule_add_exact(
1171 seccomp,
1172 SCMP_ACT_ERRNO(EPERM),
1173 SCMP_SYS(setns),
1174 1,
1175 SCMP_A1(SCMP_CMP_EQ, 0));
1176 if (r < 0) {
1177 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1178 continue;
1179 }
1180
1181 for (i = 0; namespace_flag_map[i].name; i++) {
1182 unsigned long f;
1183
1184 f = namespace_flag_map[i].flag;
1185 if ((retain & f) == f) {
1186 log_debug("Permitting %s.", namespace_flag_map[i].name);
1187 continue;
1188 }
1189
1190 log_debug("Blocking %s.", namespace_flag_map[i].name);
1191
1192 r = seccomp_rule_add_exact(
1193 seccomp,
1194 SCMP_ACT_ERRNO(EPERM),
1195 SCMP_SYS(unshare),
1196 1,
1197 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1198 if (r < 0) {
1199 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 break;
1201 }
1202
1203 /* On s390/s390x the first two parameters to clone are switched */
1204 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1205 r = seccomp_rule_add_exact(
1206 seccomp,
1207 SCMP_ACT_ERRNO(EPERM),
1208 SCMP_SYS(clone),
1209 1,
1210 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1211 else
1212 r = seccomp_rule_add_exact(
1213 seccomp,
1214 SCMP_ACT_ERRNO(EPERM),
1215 SCMP_SYS(clone),
1216 1,
1217 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1218 if (r < 0) {
1219 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1220 break;
1221 }
1222
1223 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1224 r = seccomp_rule_add_exact(
1225 seccomp,
1226 SCMP_ACT_ERRNO(EPERM),
1227 SCMP_SYS(setns),
1228 1,
1229 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1230 if (r < 0) {
1231 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1232 break;
1233 }
1234 }
1235 }
1236 if (r < 0)
1237 continue;
1238
1239 r = seccomp_load(seccomp);
1240 if (ERRNO_IS_SECCOMP_FATAL(r))
1241 return r;
1242 if (r < 0)
1243 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1244 }
1245
1246 return 0;
1247 }
1248
1249 int seccomp_protect_sysctl(void) {
1250 uint32_t arch;
1251 int r;
1252
1253 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1254 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1255
1256 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1257
1258 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1259 /* No _sysctl syscall */
1260 continue;
1261
1262 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1263 if (r < 0)
1264 return r;
1265
1266 r = seccomp_rule_add_exact(
1267 seccomp,
1268 SCMP_ACT_ERRNO(EPERM),
1269 SCMP_SYS(_sysctl),
1270 0);
1271 if (r < 0) {
1272 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1273 continue;
1274 }
1275
1276 r = seccomp_load(seccomp);
1277 if (ERRNO_IS_SECCOMP_FATAL(r))
1278 return r;
1279 if (r < 0)
1280 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1281 }
1282
1283 return 0;
1284 }
1285
1286 int seccomp_protect_syslog(void) {
1287 uint32_t arch;
1288 int r;
1289
1290 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1291 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1292
1293 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1294 if (r < 0)
1295 return r;
1296
1297 r = seccomp_rule_add_exact(
1298 seccomp,
1299 SCMP_ACT_ERRNO(EPERM),
1300 SCMP_SYS(syslog),
1301 0);
1302
1303 if (r < 0) {
1304 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1305 continue;
1306 }
1307
1308 r = seccomp_load(seccomp);
1309 if (ERRNO_IS_SECCOMP_FATAL(r))
1310 return r;
1311 if (r < 0)
1312 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1313 }
1314
1315 return 0;
1316 }
1317
1318 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1319 uint32_t arch;
1320 int r;
1321
1322 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1323 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1324 bool supported;
1325 Iterator i;
1326
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1328
1329 switch (arch) {
1330
1331 case SCMP_ARCH_X86_64:
1332 case SCMP_ARCH_X32:
1333 case SCMP_ARCH_ARM:
1334 case SCMP_ARCH_AARCH64:
1335 case SCMP_ARCH_PPC:
1336 case SCMP_ARCH_PPC64:
1337 case SCMP_ARCH_PPC64LE:
1338 case SCMP_ARCH_MIPSEL64N32:
1339 case SCMP_ARCH_MIPS64N32:
1340 case SCMP_ARCH_MIPSEL64:
1341 case SCMP_ARCH_MIPS64:
1342 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1343 supported = true;
1344 break;
1345
1346 case SCMP_ARCH_S390:
1347 case SCMP_ARCH_S390X:
1348 case SCMP_ARCH_X86:
1349 case SCMP_ARCH_MIPSEL:
1350 case SCMP_ARCH_MIPS:
1351 default:
1352 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1353 * don't know */
1354 supported = false;
1355 break;
1356 }
1357
1358 if (!supported)
1359 continue;
1360
1361 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1362 if (r < 0)
1363 return r;
1364
1365 if (whitelist) {
1366 int af, first = 0, last = 0;
1367 void *afp;
1368
1369 /* If this is a whitelist, we first block the address families that are out of range and then
1370 * everything that is not in the set. First, we find the lowest and highest address family in
1371 * the set. */
1372
1373 SET_FOREACH(afp, address_families, i) {
1374 af = PTR_TO_INT(afp);
1375
1376 if (af <= 0 || af >= af_max())
1377 continue;
1378
1379 if (first == 0 || af < first)
1380 first = af;
1381
1382 if (last == 0 || af > last)
1383 last = af;
1384 }
1385
1386 assert((first == 0) == (last == 0));
1387
1388 if (first == 0) {
1389
1390 /* No entries in the valid range, block everything */
1391 r = seccomp_rule_add_exact(
1392 seccomp,
1393 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1394 SCMP_SYS(socket),
1395 0);
1396 if (r < 0) {
1397 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 continue;
1399 }
1400
1401 } else {
1402
1403 /* Block everything below the first entry */
1404 r = seccomp_rule_add_exact(
1405 seccomp,
1406 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1407 SCMP_SYS(socket),
1408 1,
1409 SCMP_A0(SCMP_CMP_LT, first));
1410 if (r < 0) {
1411 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1412 continue;
1413 }
1414
1415 /* Block everything above the last entry */
1416 r = seccomp_rule_add_exact(
1417 seccomp,
1418 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1419 SCMP_SYS(socket),
1420 1,
1421 SCMP_A0(SCMP_CMP_GT, last));
1422 if (r < 0) {
1423 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 continue;
1425 }
1426
1427 /* Block everything between the first and last entry */
1428 for (af = 1; af < af_max(); af++) {
1429
1430 if (set_contains(address_families, INT_TO_PTR(af)))
1431 continue;
1432
1433 r = seccomp_rule_add_exact(
1434 seccomp,
1435 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1436 SCMP_SYS(socket),
1437 1,
1438 SCMP_A0(SCMP_CMP_EQ, af));
1439 if (r < 0)
1440 break;
1441 }
1442 if (r < 0) {
1443 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1444 continue;
1445 }
1446 }
1447
1448 } else {
1449 void *af;
1450
1451 /* If this is a blacklist, then generate one rule for
1452 * each address family that are then combined in OR
1453 * checks. */
1454
1455 SET_FOREACH(af, address_families, i) {
1456
1457 r = seccomp_rule_add_exact(
1458 seccomp,
1459 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1460 SCMP_SYS(socket),
1461 1,
1462 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1463 if (r < 0)
1464 break;
1465 }
1466 if (r < 0) {
1467 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1468 continue;
1469 }
1470 }
1471
1472 r = seccomp_load(seccomp);
1473 if (ERRNO_IS_SECCOMP_FATAL(r))
1474 return r;
1475 if (r < 0)
1476 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 }
1478
1479 return 0;
1480 }
1481
1482 int seccomp_restrict_realtime(void) {
1483 static const int permitted_policies[] = {
1484 SCHED_OTHER,
1485 SCHED_BATCH,
1486 SCHED_IDLE,
1487 };
1488
1489 int r, max_policy = 0;
1490 uint32_t arch;
1491 unsigned i;
1492
1493 /* Determine the highest policy constant we want to allow */
1494 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1495 if (permitted_policies[i] > max_policy)
1496 max_policy = permitted_policies[i];
1497
1498 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1499 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1500 int p;
1501
1502 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1503
1504 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1505 if (r < 0)
1506 return r;
1507
1508 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1509 * whitelist. */
1510 for (p = 0; p < max_policy; p++) {
1511 bool good = false;
1512
1513 /* Check if this is in the whitelist. */
1514 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1515 if (permitted_policies[i] == p) {
1516 good = true;
1517 break;
1518 }
1519
1520 if (good)
1521 continue;
1522
1523 /* Deny this policy */
1524 r = seccomp_rule_add_exact(
1525 seccomp,
1526 SCMP_ACT_ERRNO(EPERM),
1527 SCMP_SYS(sched_setscheduler),
1528 1,
1529 SCMP_A1(SCMP_CMP_EQ, p));
1530 if (r < 0) {
1531 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1532 continue;
1533 }
1534 }
1535
1536 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1537 * unsigned here, hence no need no check for < 0 values. */
1538 r = seccomp_rule_add_exact(
1539 seccomp,
1540 SCMP_ACT_ERRNO(EPERM),
1541 SCMP_SYS(sched_setscheduler),
1542 1,
1543 SCMP_A1(SCMP_CMP_GT, max_policy));
1544 if (r < 0) {
1545 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1546 continue;
1547 }
1548
1549 r = seccomp_load(seccomp);
1550 if (ERRNO_IS_SECCOMP_FATAL(r))
1551 return r;
1552 if (r < 0)
1553 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1554 }
1555
1556 return 0;
1557 }
1558
1559 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1560 uint32_t arch,
1561 int nr,
1562 unsigned arg_cnt,
1563 const struct scmp_arg_cmp arg) {
1564 int r;
1565
1566 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1567 if (r < 0) {
1568 _cleanup_free_ char *n = NULL;
1569
1570 n = seccomp_syscall_resolve_num_arch(arch, nr);
1571 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1572 strna(n),
1573 seccomp_arch_to_string(arch));
1574 }
1575
1576 return r;
1577 }
1578
1579 /* For known architectures, check that syscalls are indeed defined or not. */
1580 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1581 assert_cc(SCMP_SYS(shmget) > 0);
1582 assert_cc(SCMP_SYS(shmat) > 0);
1583 assert_cc(SCMP_SYS(shmdt) > 0);
1584 #endif
1585
1586 int seccomp_memory_deny_write_execute(void) {
1587 uint32_t arch;
1588 unsigned loaded = 0;
1589
1590 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1591 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1592 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1593
1594 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1595
1596 switch (arch) {
1597
1598 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1599 * We ignore that here, which means there's still a way to get writable/executable
1600 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1601
1602 case SCMP_ARCH_X86:
1603 case SCMP_ARCH_S390:
1604 filter_syscall = SCMP_SYS(mmap2);
1605 block_syscall = SCMP_SYS(mmap);
1606 /* shmat multiplexed, see above */
1607 break;
1608
1609 case SCMP_ARCH_PPC:
1610 case SCMP_ARCH_PPC64:
1611 case SCMP_ARCH_PPC64LE:
1612 case SCMP_ARCH_S390X:
1613 filter_syscall = SCMP_SYS(mmap);
1614 /* shmat multiplexed, see above */
1615 break;
1616
1617 case SCMP_ARCH_ARM:
1618 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1619 shmat_syscall = SCMP_SYS(shmat);
1620 break;
1621
1622 case SCMP_ARCH_X86_64:
1623 case SCMP_ARCH_X32:
1624 case SCMP_ARCH_AARCH64:
1625 filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
1626 shmat_syscall = SCMP_SYS(shmat);
1627 break;
1628
1629 /* Please add more definitions here, if you port systemd to other architectures! */
1630
1631 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
1632 #warning "Consider adding the right mmap() syscall definitions here!"
1633 #endif
1634 }
1635
1636 /* Can't filter mmap() on this arch, then skip it */
1637 if (filter_syscall == 0)
1638 continue;
1639
1640 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1641 if (r < 0)
1642 return r;
1643
1644 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1645 1,
1646 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1647 if (r < 0)
1648 continue;
1649
1650 if (block_syscall != 0) {
1651 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1652 if (r < 0)
1653 continue;
1654 }
1655
1656 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1657 1,
1658 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1659 if (r < 0)
1660 continue;
1661
1662 #ifdef __NR_pkey_mprotect
1663 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1664 1,
1665 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1666 if (r < 0)
1667 continue;
1668 #endif
1669
1670 if (shmat_syscall > 0) {
1671 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1672 1,
1673 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1674 if (r < 0)
1675 continue;
1676 }
1677
1678 r = seccomp_load(seccomp);
1679 if (ERRNO_IS_SECCOMP_FATAL(r))
1680 return r;
1681 if (r < 0)
1682 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1683 seccomp_arch_to_string(arch));
1684 loaded++;
1685 }
1686
1687 if (loaded == 0)
1688 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1689
1690 return loaded;
1691 }
1692
1693 int seccomp_restrict_archs(Set *archs) {
1694 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1695 Iterator i;
1696 void *id;
1697 int r;
1698
1699 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1700 * list.
1701 *
1702 * There are some qualifications. However the most important use is to stop processes from bypassing
1703 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1704 * in a non-native architecture. There are no holes in this use case, at least so far. */
1705
1706 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1707 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1708 * to run a program with the restrictions applied. */
1709 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1710 if (!seccomp)
1711 return -ENOMEM;
1712
1713 SET_FOREACH(id, archs, i) {
1714 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1715 if (r < 0 && r != -EEXIST)
1716 return r;
1717 }
1718
1719 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1720 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1721 * The important thing is that you can block the old 32-bit x86 syscalls.
1722 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1723
1724 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1725 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1726
1727 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1728 if (r < 0 && r != -EEXIST)
1729 return r;
1730 }
1731
1732 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1733 if (r < 0)
1734 return r;
1735
1736 r = seccomp_load(seccomp);
1737 if (ERRNO_IS_SECCOMP_FATAL(r))
1738 return r;
1739 if (r < 0)
1740 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1741
1742 return 0;
1743 }
1744
1745 int parse_syscall_archs(char **l, Set **archs) {
1746 _cleanup_set_free_ Set *_archs;
1747 char **s;
1748 int r;
1749
1750 assert(l);
1751 assert(archs);
1752
1753 r = set_ensure_allocated(&_archs, NULL);
1754 if (r < 0)
1755 return r;
1756
1757 STRV_FOREACH(s, l) {
1758 uint32_t a;
1759
1760 r = seccomp_arch_from_string(*s, &a);
1761 if (r < 0)
1762 return -EINVAL;
1763
1764 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1765 if (r < 0)
1766 return -ENOMEM;
1767 }
1768
1769 *archs = TAKE_PTR(_archs);
1770
1771 return 0;
1772 }
1773
1774 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1775 const char *i;
1776 int r;
1777
1778 assert(set);
1779
1780 NULSTR_FOREACH(i, set->value) {
1781
1782 if (i[0] == '@') {
1783 const SyscallFilterSet *more;
1784
1785 more = syscall_filter_set_find(i);
1786 if (!more)
1787 return -ENXIO;
1788
1789 r = seccomp_filter_set_add(filter, add, more);
1790 if (r < 0)
1791 return r;
1792 } else {
1793 int id;
1794
1795 id = seccomp_syscall_resolve_name(i);
1796 if (id == __NR_SCMP_ERROR) {
1797 log_debug("Couldn't resolve system call, ignoring: %s", i);
1798 continue;
1799 }
1800
1801 if (add) {
1802 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1803 if (r < 0)
1804 return r;
1805 } else
1806 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1807 }
1808 }
1809
1810 return 0;
1811 }
1812
1813 int seccomp_lock_personality(unsigned long personality) {
1814 uint32_t arch;
1815 int r;
1816
1817 if (personality >= PERSONALITY_INVALID)
1818 return -EINVAL;
1819
1820 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1821 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1822
1823 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1824 if (r < 0)
1825 return r;
1826
1827 r = seccomp_rule_add_exact(
1828 seccomp,
1829 SCMP_ACT_ERRNO(EPERM),
1830 SCMP_SYS(personality),
1831 1,
1832 SCMP_A0(SCMP_CMP_NE, personality));
1833 if (r < 0) {
1834 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1835 continue;
1836 }
1837
1838 r = seccomp_load(seccomp);
1839 if (ERRNO_IS_SECCOMP_FATAL(r))
1840 return r;
1841 if (r < 0)
1842 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1843 }
1844
1845 return 0;
1846 }
1847
1848 int seccomp_protect_hostname(void) {
1849 uint32_t arch;
1850 int r;
1851
1852 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1853 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1854
1855 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1856 if (r < 0)
1857 return r;
1858
1859 r = seccomp_rule_add_exact(
1860 seccomp,
1861 SCMP_ACT_ERRNO(EPERM),
1862 SCMP_SYS(sethostname),
1863 0);
1864 if (r < 0) {
1865 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1866 continue;
1867 }
1868
1869 r = seccomp_rule_add_exact(
1870 seccomp,
1871 SCMP_ACT_ERRNO(EPERM),
1872 SCMP_SYS(setdomainname),
1873 0);
1874 if (r < 0) {
1875 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1876 continue;
1877 }
1878
1879 r = seccomp_load(seccomp);
1880 if (ERRNO_IS_SECCOMP_FATAL(r))
1881 return r;
1882 if (r < 0)
1883 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1884 }
1885
1886 return 0;
1887 }
1888
1889 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1890 /* Checks the mode_t parameter of the following system calls:
1891 *
1892 * → chmod() + fchmod() + fchmodat()
1893 * → open() + creat() + openat()
1894 * → mkdir() + mkdirat()
1895 * → mknod() + mknodat()
1896 *
1897 * Returns error if *everything* failed, and 0 otherwise.
1898 */
1899 int r = 0;
1900 bool any = false;
1901
1902 r = seccomp_rule_add_exact(
1903 seccomp,
1904 SCMP_ACT_ERRNO(EPERM),
1905 SCMP_SYS(chmod),
1906 1,
1907 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1908 if (r < 0)
1909 log_debug_errno(r, "Failed to add filter for chmod: %m");
1910 else
1911 any = true;
1912
1913 r = seccomp_rule_add_exact(
1914 seccomp,
1915 SCMP_ACT_ERRNO(EPERM),
1916 SCMP_SYS(fchmod),
1917 1,
1918 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1919 if (r < 0)
1920 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1921 else
1922 any = true;
1923
1924 r = seccomp_rule_add_exact(
1925 seccomp,
1926 SCMP_ACT_ERRNO(EPERM),
1927 SCMP_SYS(fchmodat),
1928 1,
1929 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1930 if (r < 0)
1931 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1932 else
1933 any = true;
1934
1935 r = seccomp_rule_add_exact(
1936 seccomp,
1937 SCMP_ACT_ERRNO(EPERM),
1938 SCMP_SYS(mkdir),
1939 1,
1940 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1941 if (r < 0)
1942 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1943 else
1944 any = true;
1945
1946 r = seccomp_rule_add_exact(
1947 seccomp,
1948 SCMP_ACT_ERRNO(EPERM),
1949 SCMP_SYS(mkdirat),
1950 1,
1951 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1952 if (r < 0)
1953 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1954 else
1955 any = true;
1956
1957 r = seccomp_rule_add_exact(
1958 seccomp,
1959 SCMP_ACT_ERRNO(EPERM),
1960 SCMP_SYS(mknod),
1961 1,
1962 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1963 if (r < 0)
1964 log_debug_errno(r, "Failed to add filter for mknod: %m");
1965 else
1966 any = true;
1967
1968 r = seccomp_rule_add_exact(
1969 seccomp,
1970 SCMP_ACT_ERRNO(EPERM),
1971 SCMP_SYS(mknodat),
1972 1,
1973 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1974 if (r < 0)
1975 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1976 else
1977 any = true;
1978
1979 #if SCMP_SYS(open) > 0
1980 r = seccomp_rule_add_exact(
1981 seccomp,
1982 SCMP_ACT_ERRNO(EPERM),
1983 SCMP_SYS(open),
1984 2,
1985 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1986 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1987 if (r < 0)
1988 log_debug_errno(r, "Failed to add filter for open: %m");
1989 else
1990 any = true;
1991 #endif
1992
1993 r = seccomp_rule_add_exact(
1994 seccomp,
1995 SCMP_ACT_ERRNO(EPERM),
1996 SCMP_SYS(openat),
1997 2,
1998 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1999 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2000 if (r < 0)
2001 log_debug_errno(r, "Failed to add filter for openat: %m");
2002 else
2003 any = true;
2004
2005 r = seccomp_rule_add_exact(
2006 seccomp,
2007 SCMP_ACT_ERRNO(EPERM),
2008 SCMP_SYS(creat),
2009 1,
2010 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2011 if (r < 0)
2012 log_debug_errno(r, "Failed to add filter for creat: %m");
2013 else
2014 any = true;
2015
2016 return any ? 0 : r;
2017 }
2018
2019 int seccomp_restrict_suid_sgid(void) {
2020 uint32_t arch;
2021 int r, k;
2022
2023 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2024 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2025
2026 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2027 if (r < 0)
2028 return r;
2029
2030 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2031 if (r < 0)
2032 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2033
2034 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2035 if (k < 0)
2036 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2037
2038 if (r < 0 && k < 0)
2039 continue;
2040
2041 r = seccomp_load(seccomp);
2042 if (ERRNO_IS_SECCOMP_FATAL(r))
2043 return r;
2044 if (r < 0)
2045 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2046 }
2047
2048 return 0;
2049 }
2050
2051 uint32_t scmp_act_kill_process(void) {
2052
2053 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2054 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2055 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2056 * for single-threaded apps does the right thing. */
2057
2058 #ifdef SCMP_ACT_KILL_PROCESS
2059 if (seccomp_api_get() >= 3)
2060 return SCMP_ACT_KILL_PROCESS;
2061 #endif
2062
2063 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2064 }