src/shared/seccomp-util.c (thirdparty/systemd.git, blob 13c83097a55cfe300e9aa6ff7a299900ca2a1dad)
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2014 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <linux/seccomp.h>
23 #include <seccomp.h>
24 #include <stddef.h>
25 #include <sys/mman.h>
26 #include <sys/prctl.h>
27 #include <sys/shm.h>
28
29 #include "af-list.h"
30 #include "alloc-util.h"
31 #include "macro.h"
32 #include "nsflags.h"
33 #include "process-util.h"
34 #include "seccomp-util.h"
35 #include "set.h"
36 #include "string-util.h"
37 #include "strv.h"
38 #include "util.h"
39 #include "errno-list.h"
40
41 const uint32_t seccomp_local_archs[] = {
42
43 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
44
45 #if defined(__x86_64__) && defined(__ILP32__)
46 SCMP_ARCH_X86,
47 SCMP_ARCH_X86_64,
48 SCMP_ARCH_X32, /* native */
49 #elif defined(__x86_64__) && !defined(__ILP32__)
50 SCMP_ARCH_X86,
51 SCMP_ARCH_X32,
52 SCMP_ARCH_X86_64, /* native */
53 #elif defined(__i386__)
54 SCMP_ARCH_X86,
55 #elif defined(__aarch64__)
56 SCMP_ARCH_ARM,
57 SCMP_ARCH_AARCH64, /* native */
58 #elif defined(__arm__)
59 SCMP_ARCH_ARM,
60 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
61 SCMP_ARCH_MIPSEL,
62 SCMP_ARCH_MIPS, /* native */
63 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL, /* native */
66 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
67 SCMP_ARCH_MIPSEL,
68 SCMP_ARCH_MIPS,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64N32,
71 SCMP_ARCH_MIPSEL64,
72 SCMP_ARCH_MIPS64, /* native */
73 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
74 SCMP_ARCH_MIPS,
75 SCMP_ARCH_MIPSEL,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32,
78 SCMP_ARCH_MIPS64,
79 SCMP_ARCH_MIPSEL64, /* native */
80 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
81 SCMP_ARCH_MIPSEL,
82 SCMP_ARCH_MIPS,
83 SCMP_ARCH_MIPSEL64,
84 SCMP_ARCH_MIPS64,
85 SCMP_ARCH_MIPSEL64N32,
86 SCMP_ARCH_MIPS64N32, /* native */
87 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
88 SCMP_ARCH_MIPS,
89 SCMP_ARCH_MIPSEL,
90 SCMP_ARCH_MIPS64,
91 SCMP_ARCH_MIPSEL64,
92 SCMP_ARCH_MIPS64N32,
93 SCMP_ARCH_MIPSEL64N32, /* native */
94 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
95 SCMP_ARCH_PPC,
96 SCMP_ARCH_PPC64LE,
97 SCMP_ARCH_PPC64, /* native */
98 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
99 SCMP_ARCH_PPC,
100 SCMP_ARCH_PPC64,
101 SCMP_ARCH_PPC64LE, /* native */
102 #elif defined(__powerpc__)
103 SCMP_ARCH_PPC,
104 #elif defined(__s390x__)
105 SCMP_ARCH_S390,
106 SCMP_ARCH_S390X, /* native */
107 #elif defined(__s390__)
108 SCMP_ARCH_S390,
109 #endif
110 (uint32_t) -1
111 };
112
113 const char* seccomp_arch_to_string(uint32_t c) {
114 /* Maintain order used in <seccomp.h>.
115 *
116 * Names used here should be the same as those used for ConditionArchitecture=,
117 * except for "subarchitectures" like x32. */
118
119 switch(c) {
120 case SCMP_ARCH_NATIVE:
121 return "native";
122 case SCMP_ARCH_X86:
123 return "x86";
124 case SCMP_ARCH_X86_64:
125 return "x86-64";
126 case SCMP_ARCH_X32:
127 return "x32";
128 case SCMP_ARCH_ARM:
129 return "arm";
130 case SCMP_ARCH_AARCH64:
131 return "arm64";
132 case SCMP_ARCH_MIPS:
133 return "mips";
134 case SCMP_ARCH_MIPS64:
135 return "mips64";
136 case SCMP_ARCH_MIPS64N32:
137 return "mips64-n32";
138 case SCMP_ARCH_MIPSEL:
139 return "mips-le";
140 case SCMP_ARCH_MIPSEL64:
141 return "mips64-le";
142 case SCMP_ARCH_MIPSEL64N32:
143 return "mips64-le-n32";
144 case SCMP_ARCH_PPC:
145 return "ppc";
146 case SCMP_ARCH_PPC64:
147 return "ppc64";
148 case SCMP_ARCH_PPC64LE:
149 return "ppc64-le";
150 case SCMP_ARCH_S390:
151 return "s390";
152 case SCMP_ARCH_S390X:
153 return "s390x";
154 default:
155 return NULL;
156 }
157 }
158
159 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
160 if (!n)
161 return -EINVAL;
162
163 assert(ret);
164
165 if (streq(n, "native"))
166 *ret = SCMP_ARCH_NATIVE;
167 else if (streq(n, "x86"))
168 *ret = SCMP_ARCH_X86;
169 else if (streq(n, "x86-64"))
170 *ret = SCMP_ARCH_X86_64;
171 else if (streq(n, "x32"))
172 *ret = SCMP_ARCH_X32;
173 else if (streq(n, "arm"))
174 *ret = SCMP_ARCH_ARM;
175 else if (streq(n, "arm64"))
176 *ret = SCMP_ARCH_AARCH64;
177 else if (streq(n, "mips"))
178 *ret = SCMP_ARCH_MIPS;
179 else if (streq(n, "mips64"))
180 *ret = SCMP_ARCH_MIPS64;
181 else if (streq(n, "mips64-n32"))
182 *ret = SCMP_ARCH_MIPS64N32;
183 else if (streq(n, "mips-le"))
184 *ret = SCMP_ARCH_MIPSEL;
185 else if (streq(n, "mips64-le"))
186 *ret = SCMP_ARCH_MIPSEL64;
187 else if (streq(n, "mips64-le-n32"))
188 *ret = SCMP_ARCH_MIPSEL64N32;
189 else if (streq(n, "ppc"))
190 *ret = SCMP_ARCH_PPC;
191 else if (streq(n, "ppc64"))
192 *ret = SCMP_ARCH_PPC64;
193 else if (streq(n, "ppc64-le"))
194 *ret = SCMP_ARCH_PPC64LE;
195 else if (streq(n, "s390"))
196 *ret = SCMP_ARCH_S390;
197 else if (streq(n, "s390x"))
198 *ret = SCMP_ARCH_S390X;
199 else
200 return -EINVAL;
201
202 return 0;
203 }
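
/*
 * Illustrative sketch, not part of the upstream file: round-trips one of the
 * ConditionArchitecture=-style names accepted above through both helpers. The
 * example_* name is invented for illustration; the block is kept inside #if 0
 * so it is never built.
 */
#if 0
static int example_arch_round_trip(void) {
        uint32_t arch;
        int r;

        r = seccomp_arch_from_string("x86-64", &arch);
        if (r < 0)
                return r;

        /* Prints "x86-64" again; unknown arch values map back to NULL, hence strna(). */
        log_debug("Resolved \"x86-64\" to libseccomp arch 0x%x (%s)",
                  (unsigned) arch, strna(seccomp_arch_to_string(arch)));
        return 0;
}
#endif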
204
205 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
206 scmp_filter_ctx seccomp;
207 int r;
208
209 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
210 * any others. Also, turns off the NNP fiddling. */
211
212 seccomp = seccomp_init(default_action);
213 if (!seccomp)
214 return -ENOMEM;
215
216 if (arch != SCMP_ARCH_NATIVE &&
217 arch != seccomp_arch_native()) {
218
219 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_arch_add(seccomp, arch);
224 if (r < 0)
225 goto finish;
226
227 assert(seccomp_arch_exist(seccomp, arch) >= 0);
228 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
229 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
230 } else {
231 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
232 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
233 }
234
235 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
236 if (r < 0)
237 goto finish;
238
239 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
240 if (r < 0)
241 goto finish;
242
243 *ret = seccomp;
244 return 0;
245
246 finish:
247 seccomp_release(seccomp);
248 return r;
249 }
250
251 static bool is_basic_seccomp_available(void) {
252 return prctl(PR_GET_SECCOMP, 0, 0, 0, 0) >= 0;
253 }
254
255 static bool is_seccomp_filter_available(void) {
256 return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) < 0 &&
257 errno == EFAULT; /* EFAULT with a NULL filter means SECCOMP_MODE_FILTER itself is supported; EINVAL would mean it is not */
258 }
259
260 bool is_seccomp_available(void) {
261 static int cached_enabled = -1;
262
263 if (cached_enabled < 0)
264 cached_enabled =
265 is_basic_seccomp_available() &&
266 is_seccomp_filter_available();
267
268 return cached_enabled;
269 }
270
271 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
272 [SYSCALL_FILTER_SET_DEFAULT] = {
273 .name = "@default",
274 .help = "System calls that are always permitted",
275 .value =
276 "clock_getres\0"
277 "clock_gettime\0"
278 "clock_nanosleep\0"
279 "execve\0"
280 "exit\0"
281 "exit_group\0"
282 "futex\0"
283 "get_robust_list\0"
284 "get_thread_area\0"
285 "getegid\0"
286 "getegid32\0"
287 "geteuid\0"
288 "geteuid32\0"
289 "getgid\0"
290 "getgid32\0"
291 "getgroups\0"
292 "getgroups32\0"
293 "getpgid\0"
294 "getpgrp\0"
295 "getpid\0"
296 "getppid\0"
297 "getresgid\0"
298 "getresgid32\0"
299 "getresuid\0"
300 "getresuid32\0"
301 "getrlimit\0" /* make sure processes can query stack size and such */
302 "getsid\0"
303 "gettid\0"
304 "gettimeofday\0"
305 "getuid\0"
306 "getuid32\0"
307 "membarrier\0"
308 "nanosleep\0"
309 "pause\0"
310 "prlimit64\0"
311 "restart_syscall\0"
312 "rt_sigreturn\0"
313 "sched_yield\0"
314 "set_robust_list\0"
315 "set_thread_area\0"
316 "set_tid_address\0"
317 "set_tls\0"
318 "sigreturn\0"
319 "time\0"
320 "ugetrlimit\0"
321 },
322 [SYSCALL_FILTER_SET_AIO] = {
323 .name = "@aio",
324 .help = "Asynchronous IO",
325 .value =
326 "io_cancel\0"
327 "io_destroy\0"
328 "io_getevents\0"
329 "io_setup\0"
330 "io_submit\0"
331 },
332 [SYSCALL_FILTER_SET_BASIC_IO] = {
333 .name = "@basic-io",
334 .help = "Basic IO",
335 .value =
336 "_llseek\0"
337 "close\0"
338 "dup\0"
339 "dup2\0"
340 "dup3\0"
341 "lseek\0"
342 "pread64\0"
343 "preadv\0"
344 "preadv2\0"
345 "pwrite64\0"
346 "pwritev\0"
347 "pwritev2\0"
348 "read\0"
349 "readv\0"
350 "write\0"
351 "writev\0"
352 },
353 [SYSCALL_FILTER_SET_CHOWN] = {
354 .name = "@chown",
355 .help = "Change ownership of files and directories",
356 .value =
357 "chown\0"
358 "chown32\0"
359 "fchown\0"
360 "fchown32\0"
361 "fchownat\0"
362 "lchown\0"
363 "lchown32\0"
364 },
365 [SYSCALL_FILTER_SET_CLOCK] = {
366 .name = "@clock",
367 .help = "Change the system time",
368 .value =
369 "adjtimex\0"
370 "clock_adjtime\0"
371 "clock_settime\0"
372 "settimeofday\0"
373 "stime\0"
374 },
375 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
376 .name = "@cpu-emulation",
377 .help = "System calls for CPU emulation functionality",
378 .value =
379 "modify_ldt\0"
380 "subpage_prot\0"
381 "switch_endian\0"
382 "vm86\0"
383 "vm86old\0"
384 },
385 [SYSCALL_FILTER_SET_DEBUG] = {
386 .name = "@debug",
387 .help = "Debugging, performance monitoring and tracing functionality",
388 .value =
389 "lookup_dcookie\0"
390 "perf_event_open\0"
391 "process_vm_readv\0"
392 "process_vm_writev\0"
393 "ptrace\0"
394 "rtas\0"
395 #ifdef __NR_s390_runtime_instr
396 "s390_runtime_instr\0"
397 #endif
398 "sys_debug_setcontext\0"
399 },
400 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
401 .name = "@file-system",
402 .help = "File system operations",
403 .value =
404 "access\0"
405 "chdir\0"
406 "chmod\0"
407 "close\0"
408 "creat\0"
409 "faccessat\0"
410 "fallocate\0"
411 "fchdir\0"
412 "fchmod\0"
413 "fchmodat\0"
414 "fcntl\0"
415 "fcntl64\0"
416 "fgetxattr\0"
417 "flistxattr\0"
418 "fremovexattr\0"
419 "fsetxattr\0"
420 "fstat\0"
421 "fstat64\0"
422 "fstatat64\0"
423 "fstatfs\0"
424 "fstatfs64\0"
425 "ftruncate\0"
426 "ftruncate64\0"
427 "futimesat\0"
428 "getcwd\0"
429 "getdents\0"
430 "getdents64\0"
431 "getxattr\0"
432 "inotify_add_watch\0"
433 "inotify_init\0"
434 "inotify_init1\0"
435 "inotify_rm_watch\0"
436 "lgetxattr\0"
437 "link\0"
438 "linkat\0"
439 "listxattr\0"
440 "llistxattr\0"
441 "lremovexattr\0"
442 "lsetxattr\0"
443 "lstat\0"
444 "lstat64\0"
445 "mkdir\0"
446 "mkdirat\0"
447 "mknod\0"
448 "mknodat\0"
449 "mmap\0"
450 "mmap2\0"
451 "munmap\0"
452 "newfstatat\0"
453 "oldfstat\0"
454 "oldlstat\0"
455 "oldstat\0"
456 "open\0"
457 "openat\0"
458 "readlink\0"
459 "readlinkat\0"
460 "removexattr\0"
461 "rename\0"
462 "renameat\0"
463 "renameat2\0"
464 "rmdir\0"
465 "setxattr\0"
466 "stat\0"
467 "stat64\0"
468 "statfs\0"
469 "statfs64\0"
470 #ifdef __NR_statx
471 "statx\0"
472 #endif
473 "symlink\0"
474 "symlinkat\0"
475 "truncate\0"
476 "truncate64\0"
477 "unlink\0"
478 "unlinkat\0"
479 "utime\0"
480 "utimensat\0"
481 "utimes\0"
482 },
483 [SYSCALL_FILTER_SET_IO_EVENT] = {
484 .name = "@io-event",
485 .help = "Event loop system calls",
486 .value =
487 "_newselect\0"
488 "epoll_create\0"
489 "epoll_create1\0"
490 "epoll_ctl\0"
491 "epoll_ctl_old\0"
492 "epoll_pwait\0"
493 "epoll_wait\0"
494 "epoll_wait_old\0"
495 "eventfd\0"
496 "eventfd2\0"
497 "poll\0"
498 "ppoll\0"
499 "pselect6\0"
500 "select\0"
501 },
502 [SYSCALL_FILTER_SET_IPC] = {
503 .name = "@ipc",
504 .help = "SysV IPC, POSIX Message Queues or other IPC",
505 .value =
506 "ipc\0"
507 "memfd_create\0"
508 "mq_getsetattr\0"
509 "mq_notify\0"
510 "mq_open\0"
511 "mq_timedreceive\0"
512 "mq_timedsend\0"
513 "mq_unlink\0"
514 "msgctl\0"
515 "msgget\0"
516 "msgrcv\0"
517 "msgsnd\0"
518 "pipe\0"
519 "pipe2\0"
520 "process_vm_readv\0"
521 "process_vm_writev\0"
522 "semctl\0"
523 "semget\0"
524 "semop\0"
525 "semtimedop\0"
526 "shmat\0"
527 "shmctl\0"
528 "shmdt\0"
529 "shmget\0"
530 },
531 [SYSCALL_FILTER_SET_KEYRING] = {
532 .name = "@keyring",
533 .help = "Kernel keyring access",
534 .value =
535 "add_key\0"
536 "keyctl\0"
537 "request_key\0"
538 },
539 [SYSCALL_FILTER_SET_MEMLOCK] = {
540 .name = "@memlock",
541 .help = "Memory locking control",
542 .value =
543 "mlock\0"
544 "mlock2\0"
545 "mlockall\0"
546 "munlock\0"
547 "munlockall\0"
548 },
549 [SYSCALL_FILTER_SET_MODULE] = {
550 .name = "@module",
551 .help = "Loading and unloading of kernel modules",
552 .value =
553 "delete_module\0"
554 "finit_module\0"
555 "init_module\0"
556 },
557 [SYSCALL_FILTER_SET_MOUNT] = {
558 .name = "@mount",
559 .help = "Mounting and unmounting of file systems",
560 .value =
561 "chroot\0"
562 "mount\0"
563 "pivot_root\0"
564 "umount\0"
565 "umount2\0"
566 },
567 [SYSCALL_FILTER_SET_NETWORK_IO] = {
568 .name = "@network-io",
569 .help = "Network or Unix socket IO, should not be needed if not network facing",
570 .value =
571 "accept\0"
572 "accept4\0"
573 "bind\0"
574 "connect\0"
575 "getpeername\0"
576 "getsockname\0"
577 "getsockopt\0"
578 "listen\0"
579 "recv\0"
580 "recvfrom\0"
581 "recvmmsg\0"
582 "recvmsg\0"
583 "send\0"
584 "sendmmsg\0"
585 "sendmsg\0"
586 "sendto\0"
587 "setsockopt\0"
588 "shutdown\0"
589 "socket\0"
590 "socketcall\0"
591 "socketpair\0"
592 },
593 [SYSCALL_FILTER_SET_OBSOLETE] = {
594 /* some of these are unknown even to libseccomp */
595 .name = "@obsolete",
596 .help = "Unusual, obsolete or unimplemented system calls",
597 .value =
598 "_sysctl\0"
599 "afs_syscall\0"
600 "bdflush\0"
601 "break\0"
602 "create_module\0"
603 "ftime\0"
604 "get_kernel_syms\0"
605 "getpmsg\0"
606 "gtty\0"
607 "idle\0"
608 "lock\0"
609 "mpx\0"
610 "prof\0"
611 "profil\0"
612 "putpmsg\0"
613 "query_module\0"
614 "security\0"
615 "sgetmask\0"
616 "ssetmask\0"
617 "stty\0"
618 "sysfs\0"
619 "tuxcall\0"
620 "ulimit\0"
621 "uselib\0"
622 "ustat\0"
623 "vserver\0"
624 },
625 [SYSCALL_FILTER_SET_PRIVILEGED] = {
626 .name = "@privileged",
627 .help = "All system calls which need super-user capabilities",
628 .value =
629 "@chown\0"
630 "@clock\0"
631 "@module\0"
632 "@raw-io\0"
633 "@reboot\0"
634 "@swap\0"
635 "_sysctl\0"
636 "acct\0"
637 "bpf\0"
638 "capset\0"
639 "chroot\0"
640 "nfsservctl\0"
641 "pivot_root\0"
642 "quotactl\0"
643 "setdomainname\0"
644 "setfsuid\0"
645 "setfsuid32\0"
646 "setgroups\0"
647 "setgroups32\0"
648 "sethostname\0"
649 "setresuid\0"
650 "setresuid32\0"
651 "setreuid\0"
652 "setreuid32\0"
653 "setuid\0"
654 "setuid32\0"
655 "vhangup\0"
656 },
657 [SYSCALL_FILTER_SET_PROCESS] = {
658 .name = "@process",
659 .help = "Process control, execution, namespaceing operations",
660 .value =
661 "arch_prctl\0"
662 "capget\0" /* Able to query arbitrary processes */
663 "clone\0"
664 "execveat\0"
665 "fork\0"
666 "getrusage\0"
667 "kill\0"
668 "prctl\0"
669 "rt_sigqueueinfo\0"
670 "rt_tgsigqueueinfo\0"
671 "setns\0"
672 "tgkill\0"
673 "times\0"
674 "tkill\0"
675 "unshare\0"
676 "vfork\0"
677 "wait4\0"
678 "waitid\0"
679 "waitpid\0"
680 },
681 [SYSCALL_FILTER_SET_RAW_IO] = {
682 .name = "@raw-io",
683 .help = "Raw I/O port access",
684 .value =
685 "ioperm\0"
686 "iopl\0"
687 "pciconfig_iobase\0"
688 "pciconfig_read\0"
689 "pciconfig_write\0"
690 #ifdef __NR_s390_pci_mmio_read
691 "s390_pci_mmio_read\0"
692 #endif
693 #ifdef __NR_s390_pci_mmio_write
694 "s390_pci_mmio_write\0"
695 #endif
696 },
697 [SYSCALL_FILTER_SET_REBOOT] = {
698 .name = "@reboot",
699 .help = "Reboot and reboot preparation/kexec",
700 .value =
701 "kexec_file_load\0"
702 "kexec_load\0"
703 "reboot\0"
704 },
705 [SYSCALL_FILTER_SET_RESOURCES] = {
706 .name = "@resources",
707 .help = "Alter resource settings",
708 .value =
709 "ioprio_set\0"
710 "mbind\0"
711 "migrate_pages\0"
712 "move_pages\0"
713 "nice\0"
714 "sched_setaffinity\0"
715 "sched_setattr\0"
716 "sched_setparam\0"
717 "sched_setscheduler\0"
718 "set_mempolicy\0"
719 "setpriority\0"
720 "setrlimit\0"
721 },
722 [SYSCALL_FILTER_SET_SETUID] = {
723 .name = "@setuid",
724 .help = "Operations for changing user/group credentials",
725 .value =
726 "setgid\0"
727 "setgid32\0"
728 "setgroups\0"
729 "setgroups32\0"
730 "setregid\0"
731 "setregid32\0"
732 "setresgid\0"
733 "setresgid32\0"
734 "setresuid\0"
735 "setresuid32\0"
736 "setreuid\0"
737 "setreuid32\0"
738 "setuid\0"
739 "setuid32\0"
740 },
741 [SYSCALL_FILTER_SET_SIGNAL] = {
742 .name = "@signal",
743 .help = "Process signal handling",
744 .value =
745 "rt_sigaction\0"
746 "rt_sigpending\0"
747 "rt_sigprocmask\0"
748 "rt_sigsuspend\0"
749 "rt_sigtimedwait\0"
750 "sigaction\0"
751 "sigaltstack\0"
752 "signal\0"
753 "signalfd\0"
754 "signalfd4\0"
755 "sigpending\0"
756 "sigprocmask\0"
757 "sigsuspend\0"
758 },
759 [SYSCALL_FILTER_SET_SWAP] = {
760 .name = "@swap",
761 .help = "Enable/disable swap devices",
762 .value =
763 "swapoff\0"
764 "swapon\0"
765 },
766 [SYSCALL_FILTER_SET_SYNC] = {
767 .name = "@sync",
768 .help = "Synchronize files and memory to storage",
769 .value =
770 "fdatasync\0"
771 "fsync\0"
772 "msync\0"
773 "sync\0"
774 "sync_file_range\0"
775 "syncfs\0"
776 },
777 [SYSCALL_FILTER_SET_TIMER] = {
778 .name = "@timer",
779 .help = "Schedule operations by time",
780 .value =
781 "alarm\0"
782 "getitimer\0"
783 "setitimer\0"
784 "timer_create\0"
785 "timer_delete\0"
786 "timer_getoverrun\0"
787 "timer_gettime\0"
788 "timer_settime\0"
789 "timerfd_create\0"
790 "timerfd_gettime\0"
791 "timerfd_settime\0"
792 "times\0"
793 },
794 };
795
796 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
797 unsigned i;
798
799 if (isempty(name) || name[0] != '@')
800 return NULL;
801
802 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
803 if (streq(syscall_filter_sets[i].name, name))
804 return syscall_filter_sets + i;
805
806 return NULL;
807 }
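
/*
 * Illustrative sketch, not part of the upstream file: looks up a filter set by
 * its "@" name and walks its NUL-separated syscall list, the same way the
 * helpers below do. The example_* name is invented; kept inside #if 0 so it is
 * never built.
 */
#if 0
static int example_dump_filter_set(const char *name) {
        const SyscallFilterSet *set;
        const char *sys;

        set = syscall_filter_set_find(name); /* e.g. "@clock" */
        if (!set)
                return -EINVAL;

        /* .value is a nulstr: entries are separated by NUL bytes and may
         * themselves reference other sets via an "@" prefix. */
        NULSTR_FOREACH(sys, set->value)
                log_debug("%s contains: %s", set->name, sys);

        return 0;
}
#endif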
808
809 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
810
811 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
812 int r;
813
814 assert(seccomp);
815 assert(name);
816
817 if (strv_contains(exclude, name))
818 return 0;
819
820 if (name[0] == '@') {
821 const SyscallFilterSet *other;
822
823 other = syscall_filter_set_find(name);
824 if (!other) {
825 log_debug("Filter set %s is not known!", name);
826 return -EINVAL;
827 }
828
829 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
830 if (r < 0)
831 return r;
832 } else {
833 int id;
834
835 id = seccomp_syscall_resolve_name(name);
836 if (id == __NR_SCMP_ERROR) {
837 log_debug("System call %s is not known, ignoring.", name);
838 return 0;
839 }
840
841 r = seccomp_rule_add_exact(seccomp, action, id, 0);
842 if (r < 0)
843 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
844 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
845 }
846
847 return 0;
848 }
849
850 static int seccomp_add_syscall_filter_set(
851 scmp_filter_ctx seccomp,
852 const SyscallFilterSet *set,
853 uint32_t action,
854 char **exclude) {
855
856 const char *sys;
857 int r;
858
859 assert(seccomp);
860 assert(set);
861
862 NULSTR_FOREACH(sys, set->value) {
863 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
864 if (r < 0)
865 return r;
866 }
867
868 return 0;
869 }
870
871 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
872 uint32_t arch;
873 int r;
874
875 assert(set);
876
877 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
878 * each local arch. */
879
880 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
881 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
882
883 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
884
885 r = seccomp_init_for_arch(&seccomp, arch, default_action);
886 if (r < 0)
887 return r;
888
889 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
890 if (r < 0) {
891 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
892 continue;
893 }
894
895 r = seccomp_load(seccomp);
896 if (IN_SET(r, -EPERM, -EACCES))
897 return r;
898 if (r < 0)
899 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
900 }
901
902 return 0;
903 }
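
/*
 * Illustrative sketch, not part of the upstream file: how a caller might apply
 * one of the sets above, blocking everything in @obsolete with EPERM while
 * allowing everything else by default. The example_* name is invented; kept
 * inside #if 0 so it is never built.
 */
#if 0
static int example_block_obsolete(void) {
        const SyscallFilterSet *set;

        if (!is_seccomp_available())
                return 0;

        set = syscall_filter_set_find("@obsolete");
        if (!set)
                return -EINVAL;

        /* Default action allows, the listed set is mapped to EPERM. */
        return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, set, SCMP_ACT_ERRNO(EPERM));
}
#endif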
904
905 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action) {
906 uint32_t arch;
907 int r;
908
909 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead of a
910 * SyscallFilterSet* table. */
911
912 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
913 return 0;
914
915 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
916 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
917 Iterator i;
918 void *id, *val;
919
920 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
921
922 r = seccomp_init_for_arch(&seccomp, arch, default_action);
923 if (r < 0)
924 return r;
925
926 HASHMAP_FOREACH_KEY(val, id, set, i) {
927 uint32_t a = action;
928 int e = PTR_TO_INT(val);
929
930 if (action != SCMP_ACT_ALLOW && e >= 0)
931 a = SCMP_ACT_ERRNO(e);
932
933 r = seccomp_rule_add_exact(seccomp, a, PTR_TO_INT(id) - 1, 0);
934 if (r < 0) {
935 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
936 _cleanup_free_ char *n = NULL;
937
938 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
939 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
940 }
941 }
942
943 r = seccomp_load(seccomp);
944 if (IN_SET(r, -EPERM, -EACCES))
945 return r;
946 if (r < 0)
947 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
948 }
949
950 return 0;
951 }
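
/*
 * Illustrative sketch, not part of the upstream file: builds the kind of
 * Hashmap consumed above by hand. Keys are native syscall numbers (as returned
 * by seccomp_syscall_resolve_name()) offset by one so that 0 is never used as
 * a key; values are an errno to return, or a negative value to fall back to
 * the passed action. hashmap_new()/_cleanup_hashmap_free_ come from hashmap.h
 * (pulled in here via set.h). Kept inside #if 0 so it is never built.
 */
#if 0
static int example_block_ptrace_raw(void) {
        _cleanup_hashmap_free_ Hashmap *filter = NULL;
        int id, r;

        filter = hashmap_new(NULL);
        if (!filter)
                return -ENOMEM;

        id = seccomp_syscall_resolve_name("ptrace");
        if (id == __NR_SCMP_ERROR)
                return -EINVAL;

        /* Map ptrace() to EPERM explicitly. */
        r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(EPERM));
        if (r < 0)
                return r;

        /* Everything else stays allowed; listed syscalls return the stored errno. */
        return seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, filter, SCMP_ACT_ERRNO(EPERM));
}
#endif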
952
953 int seccomp_parse_syscall_filter_full(
954 const char *name,
955 int errno_num,
956 Hashmap *filter,
957 SeccompParseFlags flags,
958 const char *unit,
959 const char *filename,
960 unsigned line) {
961
962 int r;
963
964 assert(name);
965 assert(filter);
966
967 if (name[0] == '@') {
968 const SyscallFilterSet *set;
969 const char *i;
970
971 set = syscall_filter_set_find(name);
972 if (!set) {
973 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
974 return -EINVAL;
975
976 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
977 "Unknown system call group, ignoring: %s", name);
978 return 0;
979 }
980
981 NULSTR_FOREACH(i, set->value) {
982 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
983 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
984 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
985 * about them. */
986 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
987 if (r < 0)
988 return r;
989 }
990 } else {
991 int id;
992
993 id = seccomp_syscall_resolve_name(name);
994 if (id == __NR_SCMP_ERROR) {
995 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
996 return -EINVAL;
997
998 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
999 "Failed to parse system call, ignoring: %s", name);
1000 return 0;
1001 }
1002
1003 /* If we previously wanted to forbid a syscall and now
1004 * we want to allow it, then remove it from the list. */
1005 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1006 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1007 if (r < 0)
1008 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1009 } else
1010 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1011 }
1012
1013 return 0;
1014 }
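
/*
 * Illustrative sketch, not part of the upstream file: parses a couple of
 * SystemCallFilter=-style entries into a whitelist Hashmap, the way a unit
 * file parser might. The filter map is assumed to have been allocated by the
 * caller with hashmap_new(NULL); -1 as the errno means "no specific errno
 * configured". Kept inside #if 0 so it is never built.
 */
#if 0
static int example_parse_whitelist(Hashmap *filter) {
        int r;

        assert(filter);

        /* With SECCOMP_PARSE_WHITELIST set (and SECCOMP_PARSE_INVERT unset) the
         * named entries end up in the filter map; "@" groups expand recursively. */
        r = seccomp_parse_syscall_filter_full("@default", -1, filter,
                                              SECCOMP_PARSE_WHITELIST | SECCOMP_PARSE_LOG,
                                              NULL, "example.conf", 1);
        if (r < 0)
                return r;

        return seccomp_parse_syscall_filter_full("openat", -1, filter,
                                                 SECCOMP_PARSE_WHITELIST | SECCOMP_PARSE_LOG,
                                                 NULL, "example.conf", 1);
}
#endif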
1015
1016 int seccomp_restrict_namespaces(unsigned long retain) {
1017 uint32_t arch;
1018 int r;
1019
1020 if (DEBUG_LOGGING) {
1021 _cleanup_free_ char *s = NULL;
1022
1023 (void) namespace_flag_to_string_many(retain, &s);
1024 log_debug("Restricting namespace to: %s.", strna(s));
1025 }
1026
1027 /* NOOP? */
1028 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1029 return 0;
1030
1031 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1032 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1033 unsigned i;
1034
1035 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1036
1037 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1038 if (r < 0)
1039 return r;
1040
1041 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1042 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1043 * altogether. */
1044 r = seccomp_rule_add_exact(
1045 seccomp,
1046 SCMP_ACT_ERRNO(EPERM),
1047 SCMP_SYS(setns),
1048 0);
1049 else
1050 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1051 * special invocation with a zero flags argument, right here. */
1052 r = seccomp_rule_add_exact(
1053 seccomp,
1054 SCMP_ACT_ERRNO(EPERM),
1055 SCMP_SYS(setns),
1056 1,
1057 SCMP_A1(SCMP_CMP_EQ, 0));
1058 if (r < 0) {
1059 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1060 continue;
1061 }
1062
1063 for (i = 0; namespace_flag_map[i].name; i++) {
1064 unsigned long f;
1065
1066 f = namespace_flag_map[i].flag;
1067 if ((retain & f) == f) {
1068 log_debug("Permitting %s.", namespace_flag_map[i].name);
1069 continue;
1070 }
1071
1072 log_debug("Blocking %s.", namespace_flag_map[i].name);
1073
1074 r = seccomp_rule_add_exact(
1075 seccomp,
1076 SCMP_ACT_ERRNO(EPERM),
1077 SCMP_SYS(unshare),
1078 1,
1079 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1080 if (r < 0) {
1081 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1082 break;
1083 }
1084
1085 /* On s390/s390x the first two parameters to clone are switched */
1086 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1087 r = seccomp_rule_add_exact(
1088 seccomp,
1089 SCMP_ACT_ERRNO(EPERM),
1090 SCMP_SYS(clone),
1091 1,
1092 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1093 else
1094 r = seccomp_rule_add_exact(
1095 seccomp,
1096 SCMP_ACT_ERRNO(EPERM),
1097 SCMP_SYS(clone),
1098 1,
1099 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1100 if (r < 0) {
1101 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1102 break;
1103 }
1104
1105 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1106 r = seccomp_rule_add_exact(
1107 seccomp,
1108 SCMP_ACT_ERRNO(EPERM),
1109 SCMP_SYS(setns),
1110 1,
1111 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1112 if (r < 0) {
1113 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1114 break;
1115 }
1116 }
1117 }
1118 if (r < 0)
1119 continue;
1120
1121 r = seccomp_load(seccomp);
1122 if (IN_SET(r, -EPERM, -EACCES))
1123 return r;
1124 if (r < 0)
1125 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1126 }
1127
1128 return 0;
1129 }
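
/*
 * Illustrative sketch, not part of the upstream file: lets a process keep
 * creating network and mount namespaces while every other namespace type is
 * refused with EPERM in setns(), unshare() and clone(). CLONE_NEWNET and
 * CLONE_NEWNS are assumed to be visible transitively via nsflags.h. Kept
 * inside #if 0 so it is never built.
 */
#if 0
static int example_restrict_namespaces(void) {
        if (!is_seccomp_available())
                return 0;

        /* Every namespace flag not listed here is blocked. */
        return seccomp_restrict_namespaces(CLONE_NEWNET|CLONE_NEWNS);
}
#endif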
1130
1131 int seccomp_protect_sysctl(void) {
1132 uint32_t arch;
1133 int r;
1134
1135 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1136 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1137
1138 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1139
1140 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1141 /* No _sysctl syscall */
1142 continue;
1143
1144 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1145 if (r < 0)
1146 return r;
1147
1148 r = seccomp_rule_add_exact(
1149 seccomp,
1150 SCMP_ACT_ERRNO(EPERM),
1151 SCMP_SYS(_sysctl),
1152 0);
1153 if (r < 0) {
1154 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1155 continue;
1156 }
1157
1158 r = seccomp_load(seccomp);
1159 if (IN_SET(r, -EPERM, -EACCES))
1160 return r;
1161 if (r < 0)
1162 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1163 }
1164
1165 return 0;
1166 }
1167
1168 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1169 uint32_t arch;
1170 int r;
1171
1172 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1173 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1174 bool supported;
1175 Iterator i;
1176
1177 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1178
1179 switch (arch) {
1180
1181 case SCMP_ARCH_X86_64:
1182 case SCMP_ARCH_X32:
1183 case SCMP_ARCH_ARM:
1184 case SCMP_ARCH_AARCH64:
1185 case SCMP_ARCH_PPC:
1186 case SCMP_ARCH_PPC64:
1187 case SCMP_ARCH_PPC64LE:
1188 case SCMP_ARCH_MIPSEL64N32:
1189 case SCMP_ARCH_MIPS64N32:
1190 case SCMP_ARCH_MIPSEL64:
1191 case SCMP_ARCH_MIPS64:
1192 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1193 supported = true;
1194 break;
1195
1196 case SCMP_ARCH_S390:
1197 case SCMP_ARCH_S390X:
1198 case SCMP_ARCH_X86:
1199 case SCMP_ARCH_MIPSEL:
1200 case SCMP_ARCH_MIPS:
1201 default:
1202 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1203 * don't know */
1204 supported = false;
1205 break;
1206 }
1207
1208 if (!supported)
1209 continue;
1210
1211 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1212 if (r < 0)
1213 return r;
1214
1215 if (whitelist) {
1216 int af, first = 0, last = 0;
1217 void *afp;
1218
1219 /* If this is a whitelist, we first block the address families that are out of range and then
1220 * everything that is not in the set. First, we find the lowest and highest address family in
1221 * the set. */
1222
1223 SET_FOREACH(afp, address_families, i) {
1224 af = PTR_TO_INT(afp);
1225
1226 if (af <= 0 || af >= af_max())
1227 continue;
1228
1229 if (first == 0 || af < first)
1230 first = af;
1231
1232 if (last == 0 || af > last)
1233 last = af;
1234 }
1235
1236 assert((first == 0) == (last == 0));
1237
1238 if (first == 0) {
1239
1240 /* No entries in the valid range, block everything */
1241 r = seccomp_rule_add_exact(
1242 seccomp,
1243 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1244 SCMP_SYS(socket),
1245 0);
1246 if (r < 0) {
1247 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1248 continue;
1249 }
1250
1251 } else {
1252
1253 /* Block everything below the first entry */
1254 r = seccomp_rule_add_exact(
1255 seccomp,
1256 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1257 SCMP_SYS(socket),
1258 1,
1259 SCMP_A0(SCMP_CMP_LT, first));
1260 if (r < 0) {
1261 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1262 continue;
1263 }
1264
1265 /* Block everything above the last entry */
1266 r = seccomp_rule_add_exact(
1267 seccomp,
1268 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1269 SCMP_SYS(socket),
1270 1,
1271 SCMP_A0(SCMP_CMP_GT, last));
1272 if (r < 0) {
1273 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1274 continue;
1275 }
1276
1277 /* Block everything in the valid range that is not in the set */
1278 for (af = 1; af < af_max(); af++) {
1279
1280 if (set_contains(address_families, INT_TO_PTR(af)))
1281 continue;
1282
1283 r = seccomp_rule_add_exact(
1284 seccomp,
1285 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1286 SCMP_SYS(socket),
1287 1,
1288 SCMP_A0(SCMP_CMP_EQ, af));
1289 if (r < 0)
1290 break;
1291 }
1292 if (r < 0) {
1293 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1294 continue;
1295 }
1296 }
1297
1298 } else {
1299 void *af;
1300
1301 /* If this is a blacklist, then generate one rule for
1302 * each address family; the rules are then combined
1303 * into OR checks. */
1304
1305 SET_FOREACH(af, address_families, i) {
1306
1307 r = seccomp_rule_add_exact(
1308 seccomp,
1309 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1310 SCMP_SYS(socket),
1311 1,
1312 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1313 if (r < 0)
1314 break;
1315 }
1316 if (r < 0) {
1317 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1318 continue;
1319 }
1320 }
1321
1322 r = seccomp_load(seccomp);
1323 if (IN_SET(r, -EPERM, -EACCES))
1324 return r;
1325 if (r < 0)
1326 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1327 }
1328
1329 return 0;
1330 }
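
/*
 * Illustrative sketch, not part of the upstream file: whitelists AF_UNIX,
 * AF_INET and AF_INET6 so that socket() fails with EAFNOSUPPORT for every
 * other family. Address families are stored in the Set via INT_TO_PTR(), which
 * is what the function above expects; set_new()/set_put() come from set.h, and
 * the AF_* constants are assumed to be visible via af-list.h. Kept inside
 * #if 0 so it is never built.
 */
#if 0
static int example_restrict_address_families(void) {
        _cleanup_set_free_ Set *afs = NULL;
        int r;

        afs = set_new(NULL);
        if (!afs)
                return -ENOMEM;

        r = set_put(afs, INT_TO_PTR(AF_UNIX));
        if (r < 0)
                return r;
        r = set_put(afs, INT_TO_PTR(AF_INET));
        if (r < 0)
                return r;
        r = set_put(afs, INT_TO_PTR(AF_INET6));
        if (r < 0)
                return r;

        /* whitelist=true: only the listed families stay usable. */
        return seccomp_restrict_address_families(afs, true);
}
#endif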
1331
1332 int seccomp_restrict_realtime(void) {
1333 static const int permitted_policies[] = {
1334 SCHED_OTHER,
1335 SCHED_BATCH,
1336 SCHED_IDLE,
1337 };
1338
1339 int r, max_policy = 0;
1340 uint32_t arch;
1341 unsigned i;
1342
1343 /* Determine the highest policy constant we want to allow */
1344 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1345 if (permitted_policies[i] > max_policy)
1346 max_policy = permitted_policies[i];
1347
1348 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1349 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1350 int p;
1351
1352 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1353
1354 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1355 if (r < 0)
1356 return r;
1357
1358 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1359 * whitelist. */
1360 for (p = 0; p < max_policy; p++) {
1361 bool good = false;
1362
1363 /* Check if this is in the whitelist. */
1364 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1365 if (permitted_policies[i] == p) {
1366 good = true;
1367 break;
1368 }
1369
1370 if (good)
1371 continue;
1372
1373 /* Deny this policy */
1374 r = seccomp_rule_add_exact(
1375 seccomp,
1376 SCMP_ACT_ERRNO(EPERM),
1377 SCMP_SYS(sched_setscheduler),
1378 1,
1379 SCMP_A1(SCMP_CMP_EQ, p));
1380 if (r < 0) {
1381 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1382 continue;
1383 }
1384 }
1385
1386 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1387 * unsigned here, hence no need to check for < 0 values. */
1388 r = seccomp_rule_add_exact(
1389 seccomp,
1390 SCMP_ACT_ERRNO(EPERM),
1391 SCMP_SYS(sched_setscheduler),
1392 1,
1393 SCMP_A1(SCMP_CMP_GT, max_policy));
1394 if (r < 0) {
1395 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1396 continue;
1397 }
1398
1399 r = seccomp_load(seccomp);
1400 if (IN_SET(r, -EPERM, -EACCES))
1401 return r;
1402 if (r < 0)
1403 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1404 }
1405
1406 return 0;
1407 }
1408
1409 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1410 uint32_t arch,
1411 int nr,
1412 unsigned int arg_cnt,
1413 const struct scmp_arg_cmp arg) {
1414 int r;
1415
1416 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1417 if (r < 0) {
1418 _cleanup_free_ char *n = NULL;
1419
1420 n = seccomp_syscall_resolve_num_arch(arch, nr);
1421 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1422 strna(n),
1423 seccomp_arch_to_string(arch));
1424 }
1425
1426 return r;
1427 }
1428
1429 /* For known architectures, check that syscalls are indeed defined or not. */
1430 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1431 assert_cc(SCMP_SYS(shmget) > 0);
1432 assert_cc(SCMP_SYS(shmat) > 0);
1433 assert_cc(SCMP_SYS(shmdt) > 0);
1434 #elif defined(__i386__) || defined(__powerpc64__)
1435 assert_cc(SCMP_SYS(shmget) < 0);
1436 assert_cc(SCMP_SYS(shmat) < 0);
1437 assert_cc(SCMP_SYS(shmdt) < 0);
1438 #endif
1439
1440 int seccomp_memory_deny_write_execute(void) {
1441
1442 uint32_t arch;
1443 int r;
1444
1445 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1446 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1447 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1448
1449 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1450
1451 switch (arch) {
1452
1453 case SCMP_ARCH_X86:
1454 filter_syscall = SCMP_SYS(mmap2);
1455 block_syscall = SCMP_SYS(mmap);
1456 break;
1457
1458 case SCMP_ARCH_PPC:
1459 case SCMP_ARCH_PPC64:
1460 case SCMP_ARCH_PPC64LE:
1461 filter_syscall = SCMP_SYS(mmap);
1462
1463 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1464 * We ignore that here, which means there's still a way to get writable/executable
1465 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1466
1467 break;
1468
1469 case SCMP_ARCH_ARM:
1470 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1471 shmat_syscall = SCMP_SYS(shmat);
1472 break;
1473
1474 case SCMP_ARCH_X86_64:
1475 case SCMP_ARCH_X32:
1476 case SCMP_ARCH_AARCH64:
1477 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1478 shmat_syscall = SCMP_SYS(shmat);
1479 break;
1480
1481 /* Please add more definitions here, if you port systemd to other architectures! */
1482
1483 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1484 #warning "Consider adding the right mmap() syscall definitions here!"
1485 #endif
1486 }
1487
1488 /* If we can't filter mmap() on this architecture, skip it */
1489 if (filter_syscall == 0)
1490 continue;
1491
1492 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1493 if (r < 0)
1494 return r;
1495
1496 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1497 1,
1498 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1499 if (r < 0)
1500 continue;
1501
1502 if (block_syscall != 0) {
1503 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1504 if (r < 0)
1505 continue;
1506 }
1507
1508 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1509 1,
1510 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1511 if (r < 0)
1512 continue;
1513
1514 #ifdef __NR_pkey_mprotect
1515 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1516 1,
1517 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1518 if (r < 0)
1519 continue;
1520 #endif
1521
1522 if (shmat_syscall != 0) {
1523 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1524 1,
1525 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1526 if (r < 0)
1527 continue;
1528 }
1529
1530 r = seccomp_load(seccomp);
1531 if (IN_SET(r, -EPERM, -EACCES))
1532 return r;
1533 if (r < 0)
1534 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1535 }
1536
1537 return 0;
1538 }
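
/*
 * Illustrative sketch, not part of the upstream file: after installing the
 * filter above, a request for anonymous memory that is both writable and
 * executable is expected to fail with EPERM, while plain writable mappings
 * keep working. The example_* name is invented; kept inside #if 0 so it is
 * never built.
 */
#if 0
static int example_check_memory_deny_write_execute(void) {
        void *p;
        int r;

        r = seccomp_memory_deny_write_execute();
        if (r < 0)
                return r;

        p = mmap(NULL, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (p != MAP_FAILED) {
                (void) munmap(p, 4096);
                return -ENOTRECOVERABLE; /* the filter did not take effect */
        }

        return errno == EPERM ? 0 : -errno;
}
#endif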
1539
1540 int seccomp_restrict_archs(Set *archs) {
1541 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1542 Iterator i;
1543 void *id;
1544 int r;
1545
1546 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1547 * list.
1548 *
1549 * There are some qualifications. However, the most important use is to stop processes from bypassing
1550 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1551 * in a non-native architecture. There are no holes in this use case, at least so far. */
1552
1553 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1554 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1555 * to run a program with the restrictions applied. */
1556 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1557 if (!seccomp)
1558 return -ENOMEM;
1559
1560 SET_FOREACH(id, archs, i) {
1561 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1562 if (r < 0 && r != -EEXIST)
1563 return r;
1564 }
1565
1566 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since
1567 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1568 * The important thing is that you can block the old 32-bit x86 syscalls.
1569 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1570
1571 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1572 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1573
1574 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1575 if (r < 0 && r != -EEXIST)
1576 return r;
1577 }
1578
1579 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1580 if (r < 0)
1581 return r;
1582
1583 r = seccomp_load(seccomp);
1584 if (IN_SET(r, -EPERM, -EACCES))
1585 return r;
1586 if (r < 0)
1587 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1588
1589 return 0;
1590 }
1591
1592 int parse_syscall_archs(char **l, Set **archs) {
1593 _cleanup_set_free_ Set *_archs = NULL;
1594 char **s;
1595 int r;
1596
1597 assert(l);
1598 assert(archs);
1599
1600 r = set_ensure_allocated(&_archs, NULL);
1601 if (r < 0)
1602 return r;
1603
1604 STRV_FOREACH(s, l) {
1605 uint32_t a;
1606
1607 r = seccomp_arch_from_string(*s, &a);
1608 if (r < 0)
1609 return -EINVAL;
1610
1611 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1612 if (r < 0)
1613 return -ENOMEM;
1614 }
1615
1616 *archs = _archs;
1617 _archs = NULL;
1618
1619 return 0;
1620 }
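
/*
 * Illustrative sketch, not part of the upstream file: parses a list of
 * architecture names into the Set representation used above (arch id + 1, so
 * that 0 is never stored) and pins the process to those ABIs; syscalls issued
 * through any other ABI are rejected by the kernel's bad-arch action.
 * STRV_MAKE() comes from strv.h. Kept inside #if 0 so it is never built.
 */
#if 0
static int example_restrict_to_native_and_x86_64(void) {
        _cleanup_set_free_ Set *archs = NULL;
        int r;

        r = parse_syscall_archs(STRV_MAKE("native", "x86-64"), &archs);
        if (r < 0)
                return r;

        return seccomp_restrict_archs(archs);
}
#endif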
1621
1622 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1623 const char *i;
1624 int r;
1625
1626 assert(set);
1627
1628 NULSTR_FOREACH(i, set->value) {
1629
1630 if (i[0] == '@') {
1631 const SyscallFilterSet *more;
1632
1633 more = syscall_filter_set_find(i);
1634 if (!more)
1635 return -ENXIO;
1636
1637 r = seccomp_filter_set_add(filter, add, more);
1638 if (r < 0)
1639 return r;
1640 } else {
1641 int id;
1642
1643 id = seccomp_syscall_resolve_name(i);
1644 if (id == __NR_SCMP_ERROR) {
1645 log_debug("Couldn't resolve system call, ignoring: %s", i);
1646 continue;
1647 }
1648
1649 if (add) {
1650 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1651 if (r < 0)
1652 return r;
1653 } else
1654 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1655 }
1656 }
1657
1658 return 0;
1659 }
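
/*
 * Illustrative sketch, not part of the upstream file: seeds a whitelist
 * Hashmap with everything from @default plus @basic-io, then drops @aio again,
 * mirroring the add/remove semantics of SystemCallFilter=. hashmap_new() comes
 * from hashmap.h (via set.h); the named sets are known to exist in the table
 * above. Kept inside #if 0 so it is never built.
 */
#if 0
static int example_build_filter_map(Hashmap **ret) {
        _cleanup_hashmap_free_ Hashmap *filter = NULL;
        int r;

        filter = hashmap_new(NULL);
        if (!filter)
                return -ENOMEM;

        r = seccomp_filter_set_add(filter, true, syscall_filter_set_find("@default"));
        if (r < 0)
                return r;
        r = seccomp_filter_set_add(filter, true, syscall_filter_set_find("@basic-io"));
        if (r < 0)
                return r;

        /* add=false removes the set's syscalls from the map again. */
        r = seccomp_filter_set_add(filter, false, syscall_filter_set_find("@aio"));
        if (r < 0)
                return r;

        *ret = filter;
        filter = NULL;
        return 0;
}
#endif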
1660
1661 int seccomp_lock_personality(unsigned long personality) {
1662 uint32_t arch;
1663 int r;
1664
1665 if (personality >= PERSONALITY_INVALID)
1666 return -EINVAL;
1667
1668 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1669 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1670
1671 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1672 if (r < 0)
1673 return r;
1674
1675 r = seccomp_rule_add_exact(
1676 seccomp,
1677 SCMP_ACT_ERRNO(EPERM),
1678 SCMP_SYS(personality),
1679 1,
1680 SCMP_A0(SCMP_CMP_NE, personality));
1681 if (r < 0) {
1682 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1683 continue;
1684 }
1685
1686 r = seccomp_load(seccomp);
1687 if (IN_SET(r, -EPERM, -EACCES))
1688 return r;
1689 if (r < 0)
1690 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1691 }
1692
1693 return 0;
1694 }
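
/*
 * Illustrative sketch, not part of the upstream file: pins the process to the
 * default Linux personality; any personality() call asking for something else
 * fails with EPERM. PER_LINUX would come from <sys/personality.h>, which this
 * file does not include itself. Kept inside #if 0 so it is never built.
 */
#if 0
static int example_lock_personality(void) {
        if (!is_seccomp_available())
                return 0;

        return seccomp_lock_personality(PER_LINUX);
}
#endif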