]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
seccomp: arm64 does not have mmap2
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <linux/seccomp.h>
22 #include <seccomp.h>
23 #include <stddef.h>
24 #include <sys/mman.h>
25 #include <sys/prctl.h>
26 #include <sys/shm.h>
27
28 #include "af-list.h"
29 #include "alloc-util.h"
30 #include "macro.h"
31 #include "nsflags.h"
32 #include "seccomp-util.h"
33 #include "string-util.h"
34 #include "util.h"
35 #include "errno-list.h"
36
37 const uint32_t seccomp_local_archs[] = {
38
39 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
40
41 #if defined(__x86_64__) && defined(__ILP32__)
42 SCMP_ARCH_X86,
43 SCMP_ARCH_X86_64,
44 SCMP_ARCH_X32, /* native */
45 #elif defined(__x86_64__) && !defined(__ILP32__)
46 SCMP_ARCH_X86,
47 SCMP_ARCH_X32,
48 SCMP_ARCH_X86_64, /* native */
49 #elif defined(__i386__)
50 SCMP_ARCH_X86,
51 #elif defined(__aarch64__)
52 SCMP_ARCH_ARM,
53 SCMP_ARCH_AARCH64, /* native */
54 #elif defined(__arm__)
55 SCMP_ARCH_ARM,
56 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS, /* native */
59 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPS,
61 SCMP_ARCH_MIPSEL, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
63 SCMP_ARCH_MIPSEL,
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL64N32,
66 SCMP_ARCH_MIPS64N32,
67 SCMP_ARCH_MIPSEL64,
68 SCMP_ARCH_MIPS64, /* native */
69 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
70 SCMP_ARCH_MIPS,
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS64N32,
73 SCMP_ARCH_MIPSEL64N32,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64, /* native */
76 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
77 SCMP_ARCH_MIPSEL,
78 SCMP_ARCH_MIPS,
79 SCMP_ARCH_MIPSEL64,
80 SCMP_ARCH_MIPS64,
81 SCMP_ARCH_MIPSEL64N32,
82 SCMP_ARCH_MIPS64N32, /* native */
83 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
84 SCMP_ARCH_MIPS,
85 SCMP_ARCH_MIPSEL,
86 SCMP_ARCH_MIPS64,
87 SCMP_ARCH_MIPSEL64,
88 SCMP_ARCH_MIPS64N32,
89 SCMP_ARCH_MIPSEL64N32, /* native */
90 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
91 SCMP_ARCH_PPC,
92 SCMP_ARCH_PPC64LE,
93 SCMP_ARCH_PPC64, /* native */
94 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
95 SCMP_ARCH_PPC,
96 SCMP_ARCH_PPC64,
97 SCMP_ARCH_PPC64LE, /* native */
98 #elif defined(__powerpc__)
99 SCMP_ARCH_PPC,
100 #elif defined(__s390x__)
101 SCMP_ARCH_S390,
102 SCMP_ARCH_S390X, /* native */
103 #elif defined(__s390__)
104 SCMP_ARCH_S390,
105 #endif
106 (uint32_t) -1
107 };
108
109 const char* seccomp_arch_to_string(uint32_t c) {
110 /* Maintain order used in <seccomp.h>.
111 *
112 * Names used here should be the same as those used for ConditionArchitecture=,
113 * except for "subarchitectures" like x32. */
114
115 switch(c) {
116 case SCMP_ARCH_NATIVE:
117 return "native";
118 case SCMP_ARCH_X86:
119 return "x86";
120 case SCMP_ARCH_X86_64:
121 return "x86-64";
122 case SCMP_ARCH_X32:
123 return "x32";
124 case SCMP_ARCH_ARM:
125 return "arm";
126 case SCMP_ARCH_AARCH64:
127 return "arm64";
128 case SCMP_ARCH_MIPS:
129 return "mips";
130 case SCMP_ARCH_MIPS64:
131 return "mips64";
132 case SCMP_ARCH_MIPS64N32:
133 return "mips64-n32";
134 case SCMP_ARCH_MIPSEL:
135 return "mips-le";
136 case SCMP_ARCH_MIPSEL64:
137 return "mips64-le";
138 case SCMP_ARCH_MIPSEL64N32:
139 return "mips64-le-n32";
140 case SCMP_ARCH_PPC:
141 return "ppc";
142 case SCMP_ARCH_PPC64:
143 return "ppc64";
144 case SCMP_ARCH_PPC64LE:
145 return "ppc64-le";
146 case SCMP_ARCH_S390:
147 return "s390";
148 case SCMP_ARCH_S390X:
149 return "s390x";
150 default:
151 return NULL;
152 }
153 }
154
155 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
156 if (!n)
157 return -EINVAL;
158
159 assert(ret);
160
161 if (streq(n, "native"))
162 *ret = SCMP_ARCH_NATIVE;
163 else if (streq(n, "x86"))
164 *ret = SCMP_ARCH_X86;
165 else if (streq(n, "x86-64"))
166 *ret = SCMP_ARCH_X86_64;
167 else if (streq(n, "x32"))
168 *ret = SCMP_ARCH_X32;
169 else if (streq(n, "arm"))
170 *ret = SCMP_ARCH_ARM;
171 else if (streq(n, "arm64"))
172 *ret = SCMP_ARCH_AARCH64;
173 else if (streq(n, "mips"))
174 *ret = SCMP_ARCH_MIPS;
175 else if (streq(n, "mips64"))
176 *ret = SCMP_ARCH_MIPS64;
177 else if (streq(n, "mips64-n32"))
178 *ret = SCMP_ARCH_MIPS64N32;
179 else if (streq(n, "mips-le"))
180 *ret = SCMP_ARCH_MIPSEL;
181 else if (streq(n, "mips64-le"))
182 *ret = SCMP_ARCH_MIPSEL64;
183 else if (streq(n, "mips64-le-n32"))
184 *ret = SCMP_ARCH_MIPSEL64N32;
185 else if (streq(n, "ppc"))
186 *ret = SCMP_ARCH_PPC;
187 else if (streq(n, "ppc64"))
188 *ret = SCMP_ARCH_PPC64;
189 else if (streq(n, "ppc64-le"))
190 *ret = SCMP_ARCH_PPC64LE;
191 else if (streq(n, "s390"))
192 *ret = SCMP_ARCH_S390;
193 else if (streq(n, "s390x"))
194 *ret = SCMP_ARCH_S390X;
195 else
196 return -EINVAL;
197
198 return 0;
199 }
200
201 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
202 scmp_filter_ctx seccomp;
203 int r;
204
205 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
206 * any others. Also, turns off the NNP fiddling. */
207
208 seccomp = seccomp_init(default_action);
209 if (!seccomp)
210 return -ENOMEM;
211
212 if (arch != SCMP_ARCH_NATIVE &&
213 arch != seccomp_arch_native()) {
214
215 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
216 if (r < 0)
217 goto finish;
218
219 r = seccomp_arch_add(seccomp, arch);
220 if (r < 0)
221 goto finish;
222
223 assert(seccomp_arch_exist(seccomp, arch) >= 0);
224 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
225 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
226 } else {
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
229 }
230
231 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
232 if (r < 0)
233 goto finish;
234
235 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
236 if (r < 0)
237 goto finish;
238
239 *ret = seccomp;
240 return 0;
241
242 finish:
243 seccomp_release(seccomp);
244 return r;
245 }
246
/* Returns true if querying the current seccomp mode via prctl(PR_GET_SECCOMP) succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
250
/* Probes for seccomp filter mode by trying to install a filter from a NULL pointer. This is taken as
 * "available" only if the call fails and errno is EFAULT; any other outcome counts as "not available". */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
255
/* Returns true if both basic seccomp and seccomp filter mode are available. The probes run on the first call
 * only; the outcome is kept in a static variable and returned directly afterwards. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* < 0 means: not determined yet */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
266
267 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
268 [SYSCALL_FILTER_SET_DEFAULT] = {
269 .name = "@default",
270 .help = "System calls that are always permitted",
271 .value =
272 "clock_getres\0"
273 "clock_gettime\0"
274 "clock_nanosleep\0"
275 "execve\0"
276 "exit\0"
277 "exit_group\0"
278 "getrlimit\0" /* make sure processes can query stack size and such */
279 "gettimeofday\0"
280 "nanosleep\0"
281 "pause\0"
282 "rt_sigreturn\0"
283 "sigreturn\0"
284 "time\0"
285 },
286 [SYSCALL_FILTER_SET_BASIC_IO] = {
287 .name = "@basic-io",
288 .help = "Basic IO",
289 .value =
290 "close\0"
291 "dup2\0"
292 "dup3\0"
293 "dup\0"
294 "lseek\0"
295 "pread64\0"
296 "preadv\0"
297 "pwrite64\0"
298 "pwritev\0"
299 "read\0"
300 "readv\0"
301 "write\0"
302 "writev\0"
303 },
304 [SYSCALL_FILTER_SET_CLOCK] = {
305 .name = "@clock",
306 .help = "Change the system time",
307 .value =
308 "adjtimex\0"
309 "clock_adjtime\0"
310 "clock_settime\0"
311 "settimeofday\0"
312 "stime\0"
313 },
314 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
315 .name = "@cpu-emulation",
316 .help = "System calls for CPU emulation functionality",
317 .value =
318 "modify_ldt\0"
319 "subpage_prot\0"
320 "switch_endian\0"
321 "vm86\0"
322 "vm86old\0"
323 },
324 [SYSCALL_FILTER_SET_DEBUG] = {
325 .name = "@debug",
326 .help = "Debugging, performance monitoring and tracing functionality",
327 .value =
328 "lookup_dcookie\0"
329 "perf_event_open\0"
330 "process_vm_readv\0"
331 "process_vm_writev\0"
332 "ptrace\0"
333 "rtas\0"
334 #ifdef __NR_s390_runtime_instr
335 "s390_runtime_instr\0"
336 #endif
337 "sys_debug_setcontext\0"
338 },
339 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
340 .name = "@file-system",
341 .help = "File system operations",
342 .value =
343 "access\0"
344 "chdir\0"
345 "chmod\0"
346 "close\0"
347 "creat\0"
348 "faccessat\0"
349 "fallocate\0"
350 "fchdir\0"
351 "fchmod\0"
352 "fchmodat\0"
353 "fcntl64\0"
354 "fcntl\0"
355 "fgetxattr\0"
356 "flistxattr\0"
357 "fsetxattr\0"
358 "fstat64\0"
359 "fstat\0"
360 "fstatat64\0"
361 "fstatfs64\0"
362 "fstatfs\0"
363 "ftruncate64\0"
364 "ftruncate\0"
365 "futimesat\0"
366 "getcwd\0"
367 "getdents64\0"
368 "getdents\0"
369 "getxattr\0"
370 "inotify_add_watch\0"
371 "inotify_init1\0"
372 "inotify_rm_watch\0"
373 "lgetxattr\0"
374 "link\0"
375 "linkat\0"
376 "listxattr\0"
377 "llistxattr\0"
378 "lremovexattr\0"
379 "lsetxattr\0"
380 "lstat64\0"
381 "lstat\0"
382 "mkdir\0"
383 "mkdirat\0"
384 "mknod\0"
385 "mknodat\0"
386 "mmap2\0"
387 "mmap\0"
388 "munmap\0"
389 "newfstatat\0"
390 "open\0"
391 "openat\0"
392 "readlink\0"
393 "readlinkat\0"
394 "removexattr\0"
395 "rename\0"
396 "renameat2\0"
397 "renameat\0"
398 "rmdir\0"
399 "setxattr\0"
400 "stat64\0"
401 "stat\0"
402 "statfs\0"
403 "symlink\0"
404 "symlinkat\0"
405 "truncate64\0"
406 "truncate\0"
407 "unlink\0"
408 "unlinkat\0"
409 "utimensat\0"
410 "utimes\0"
411 },
412 [SYSCALL_FILTER_SET_IO_EVENT] = {
413 .name = "@io-event",
414 .help = "Event loop system calls",
415 .value =
416 "_newselect\0"
417 "epoll_create1\0"
418 "epoll_create\0"
419 "epoll_ctl\0"
420 "epoll_ctl_old\0"
421 "epoll_pwait\0"
422 "epoll_wait\0"
423 "epoll_wait_old\0"
424 "eventfd2\0"
425 "eventfd\0"
426 "poll\0"
427 "ppoll\0"
428 "pselect6\0"
429 "select\0"
430 },
431 [SYSCALL_FILTER_SET_IPC] = {
432 .name = "@ipc",
433 .help = "SysV IPC, POSIX Message Queues or other IPC",
434 .value =
435 "ipc\0"
436 "memfd_create\0"
437 "mq_getsetattr\0"
438 "mq_notify\0"
439 "mq_open\0"
440 "mq_timedreceive\0"
441 "mq_timedsend\0"
442 "mq_unlink\0"
443 "msgctl\0"
444 "msgget\0"
445 "msgrcv\0"
446 "msgsnd\0"
447 "pipe2\0"
448 "pipe\0"
449 "process_vm_readv\0"
450 "process_vm_writev\0"
451 "semctl\0"
452 "semget\0"
453 "semop\0"
454 "semtimedop\0"
455 "shmat\0"
456 "shmctl\0"
457 "shmdt\0"
458 "shmget\0"
459 },
460 [SYSCALL_FILTER_SET_KEYRING] = {
461 .name = "@keyring",
462 .help = "Kernel keyring access",
463 .value =
464 "add_key\0"
465 "keyctl\0"
466 "request_key\0"
467 },
468 [SYSCALL_FILTER_SET_MODULE] = {
469 .name = "@module",
470 .help = "Loading and unloading of kernel modules",
471 .value =
472 "delete_module\0"
473 "finit_module\0"
474 "init_module\0"
475 },
476 [SYSCALL_FILTER_SET_MOUNT] = {
477 .name = "@mount",
478 .help = "Mounting and unmounting of file systems",
479 .value =
480 "chroot\0"
481 "mount\0"
482 "pivot_root\0"
483 "umount2\0"
484 "umount\0"
485 },
486 [SYSCALL_FILTER_SET_NETWORK_IO] = {
487 .name = "@network-io",
488 .help = "Network or Unix socket IO, should not be needed if not network facing",
489 .value =
490 "accept4\0"
491 "accept\0"
492 "bind\0"
493 "connect\0"
494 "getpeername\0"
495 "getsockname\0"
496 "getsockopt\0"
497 "listen\0"
498 "recv\0"
499 "recvfrom\0"
500 "recvmmsg\0"
501 "recvmsg\0"
502 "send\0"
503 "sendmmsg\0"
504 "sendmsg\0"
505 "sendto\0"
506 "setsockopt\0"
507 "shutdown\0"
508 "socket\0"
509 "socketcall\0"
510 "socketpair\0"
511 },
512 [SYSCALL_FILTER_SET_OBSOLETE] = {
513 /* some unknown even to libseccomp */
514 .name = "@obsolete",
515 .help = "Unusual, obsolete or unimplemented system calls",
516 .value =
517 "_sysctl\0"
518 "afs_syscall\0"
519 "bdflush\0"
520 "break\0"
521 "create_module\0"
522 "ftime\0"
523 "get_kernel_syms\0"
524 "getpmsg\0"
525 "gtty\0"
526 "lock\0"
527 "mpx\0"
528 "prof\0"
529 "profil\0"
530 "putpmsg\0"
531 "query_module\0"
532 "security\0"
533 "sgetmask\0"
534 "ssetmask\0"
535 "stty\0"
536 "sysfs\0"
537 "tuxcall\0"
538 "ulimit\0"
539 "uselib\0"
540 "ustat\0"
541 "vserver\0"
542 },
543 [SYSCALL_FILTER_SET_PRIVILEGED] = {
544 .name = "@privileged",
545 .help = "All system calls which need super-user capabilities",
546 .value =
547 "@clock\0"
548 "@module\0"
549 "@raw-io\0"
550 "acct\0"
551 "bpf\0"
552 "capset\0"
553 "chown32\0"
554 "chown\0"
555 "chroot\0"
556 "fchown32\0"
557 "fchown\0"
558 "fchownat\0"
559 "kexec_file_load\0"
560 "kexec_load\0"
561 "lchown32\0"
562 "lchown\0"
563 "nfsservctl\0"
564 "pivot_root\0"
565 "quotactl\0"
566 "reboot\0"
567 "setdomainname\0"
568 "setfsuid32\0"
569 "setfsuid\0"
570 "setgroups32\0"
571 "setgroups\0"
572 "sethostname\0"
573 "setresuid32\0"
574 "setresuid\0"
575 "setreuid32\0"
576 "setreuid\0"
577 "setuid32\0"
578 "setuid\0"
579 "swapoff\0"
580 "swapon\0"
581 "_sysctl\0"
582 "vhangup\0"
583 },
584 [SYSCALL_FILTER_SET_PROCESS] = {
585 .name = "@process",
586 .help = "Process control, execution, namespaceing operations",
587 .value =
588 "arch_prctl\0"
589 "clone\0"
590 "execveat\0"
591 "fork\0"
592 "kill\0"
593 "prctl\0"
594 "setns\0"
595 "tgkill\0"
596 "tkill\0"
597 "unshare\0"
598 "vfork\0"
599 },
600 [SYSCALL_FILTER_SET_RAW_IO] = {
601 .name = "@raw-io",
602 .help = "Raw I/O port access",
603 .value =
604 "ioperm\0"
605 "iopl\0"
606 "pciconfig_iobase\0"
607 "pciconfig_read\0"
608 "pciconfig_write\0"
609 #ifdef __NR_s390_pci_mmio_read
610 "s390_pci_mmio_read\0"
611 #endif
612 #ifdef __NR_s390_pci_mmio_write
613 "s390_pci_mmio_write\0"
614 #endif
615 },
616 [SYSCALL_FILTER_SET_REBOOT] = {
617 .name = "@reboot",
618 .help = "Reboot and reboot preparation/kexec",
619 .value =
620 "kexec\0"
621 "kexec_file_load\0"
622 "reboot\0"
623 },
624 [SYSCALL_FILTER_SET_RESOURCES] = {
625 .name = "@resources",
626 .help = "Alter resource settings",
627 .value =
628 "sched_setparam\0"
629 "sched_setscheduler\0"
630 "sched_setaffinity\0"
631 "setpriority\0"
632 "setrlimit\0"
633 "set_mempolicy\0"
634 "migrate_pages\0"
635 "move_pages\0"
636 "mbind\0"
637 "sched_setattr\0"
638 "prlimit64\0"
639 },
640 [SYSCALL_FILTER_SET_SWAP] = {
641 .name = "@swap",
642 .help = "Enable/disable swap devices",
643 .value =
644 "swapoff\0"
645 "swapon\0"
646 },
647 };
648
649 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
650 unsigned i;
651
652 if (isempty(name) || name[0] != '@')
653 return NULL;
654
655 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
656 if (streq(syscall_filter_sets[i].name, name))
657 return syscall_filter_sets + i;
658
659 return NULL;
660 }
661
662 static int seccomp_add_syscall_filter_set(
663 scmp_filter_ctx seccomp,
664 uint32_t default_action,
665 const SyscallFilterSet *set,
666 uint32_t action) {
667
668 const char *sys;
669 int r;
670
671 assert(seccomp);
672 assert(set);
673
674 NULSTR_FOREACH(sys, set->value) {
675 int id;
676
677 if (sys[0] == '@') {
678 const SyscallFilterSet *other;
679
680 other = syscall_filter_set_find(sys);
681 if (!other)
682 return -EINVAL;
683
684 r = seccomp_add_syscall_filter_set(seccomp, default_action, other, action);
685 if (r < 0)
686 return r;
687 } else {
688 id = seccomp_syscall_resolve_name(sys);
689 if (id == __NR_SCMP_ERROR)
690 return -EINVAL; /* Not known at all? Then that's a real error */
691
692 r = seccomp_rule_add_exact(seccomp, action, id, 0);
693 if (r < 0)
694 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
695 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", sys, id);
696 }
697 }
698
699 return 0;
700 }
701
702 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
703 uint32_t arch;
704 int r;
705
706 assert(set);
707
708 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
709 * earch local arch. */
710
711 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
712 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
713
714 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
715
716 r = seccomp_init_for_arch(&seccomp, arch, default_action);
717 if (r < 0)
718 return r;
719
720 r = seccomp_add_syscall_filter_set(seccomp, default_action, set, action);
721 if (r < 0) {
722 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
723 continue;
724 }
725
726 r = seccomp_load(seccomp);
727 if (IN_SET(r, -EPERM, -EACCES))
728 return r;
729 if (r < 0)
730 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
731 }
732
733 return 0;
734 }
735
736 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
737 uint32_t arch;
738 int r;
739
740 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
741 * SyscallFilterSet* table. */
742
743 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
744 return 0;
745
746 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
747 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
748 Iterator i;
749 void *id;
750
751 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
752
753 r = seccomp_init_for_arch(&seccomp, arch, default_action);
754 if (r < 0)
755 return r;
756
757 SET_FOREACH(id, set, i) {
758 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
759 if (r < 0) {
760 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
761 _cleanup_free_ char *n = NULL;
762
763 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
764 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
765 }
766 }
767
768 r = seccomp_load(seccomp);
769 if (IN_SET(r, -EPERM, -EACCES))
770 return r;
771 if (r < 0)
772 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
773 }
774
775 return 0;
776 }
777
778 int seccomp_restrict_namespaces(unsigned long retain) {
779 uint32_t arch;
780 int r;
781
782 if (log_get_max_level() >= LOG_DEBUG) {
783 _cleanup_free_ char *s = NULL;
784
785 (void) namespace_flag_to_string_many(retain, &s);
786 log_debug("Restricting namespace to: %s.", strna(s));
787 }
788
789 /* NOOP? */
790 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
791 return 0;
792
793 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
794 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
795 unsigned i;
796
797 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
798
799 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
800 if (r < 0)
801 return r;
802
803 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
804 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
805 * altogether. */
806 r = seccomp_rule_add_exact(
807 seccomp,
808 SCMP_ACT_ERRNO(EPERM),
809 SCMP_SYS(setns),
810 0);
811 else
812 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
813 * special invocation with a zero flags argument, right here. */
814 r = seccomp_rule_add_exact(
815 seccomp,
816 SCMP_ACT_ERRNO(EPERM),
817 SCMP_SYS(setns),
818 1,
819 SCMP_A1(SCMP_CMP_EQ, 0));
820 if (r < 0) {
821 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
822 continue;
823 }
824
825 for (i = 0; namespace_flag_map[i].name; i++) {
826 unsigned long f;
827
828 f = namespace_flag_map[i].flag;
829 if ((retain & f) == f) {
830 log_debug("Permitting %s.", namespace_flag_map[i].name);
831 continue;
832 }
833
834 log_debug("Blocking %s.", namespace_flag_map[i].name);
835
836 r = seccomp_rule_add_exact(
837 seccomp,
838 SCMP_ACT_ERRNO(EPERM),
839 SCMP_SYS(unshare),
840 1,
841 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
842 if (r < 0) {
843 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
844 break;
845 }
846
847 /* On s390/s390x the first two parameters to clone are switched */
848 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
849 r = seccomp_rule_add_exact(
850 seccomp,
851 SCMP_ACT_ERRNO(EPERM),
852 SCMP_SYS(clone),
853 1,
854 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
855 else
856 r = seccomp_rule_add_exact(
857 seccomp,
858 SCMP_ACT_ERRNO(EPERM),
859 SCMP_SYS(clone),
860 1,
861 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
862 if (r < 0) {
863 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
864 break;
865 }
866
867 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
868 r = seccomp_rule_add_exact(
869 seccomp,
870 SCMP_ACT_ERRNO(EPERM),
871 SCMP_SYS(setns),
872 1,
873 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
874 if (r < 0) {
875 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
876 break;
877 }
878 }
879 }
880 if (r < 0)
881 continue;
882
883 r = seccomp_load(seccomp);
884 if (IN_SET(r, -EPERM, -EACCES))
885 return r;
886 if (r < 0)
887 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
888 }
889
890 return 0;
891 }
892
893 int seccomp_protect_sysctl(void) {
894 uint32_t arch;
895 int r;
896
897 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
898 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
899
900 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
901
902 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
903 /* No _sysctl syscall */
904 continue;
905
906 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
907 if (r < 0)
908 return r;
909
910 r = seccomp_rule_add_exact(
911 seccomp,
912 SCMP_ACT_ERRNO(EPERM),
913 SCMP_SYS(_sysctl),
914 0);
915 if (r < 0) {
916 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
917 continue;
918 }
919
920 r = seccomp_load(seccomp);
921 if (IN_SET(r, -EPERM, -EACCES))
922 return r;
923 if (r < 0)
924 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
925 }
926
927 return 0;
928 }
929
930 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
931 uint32_t arch;
932 int r;
933
934 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
935 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
936 bool supported;
937 Iterator i;
938
939 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
940
941 switch (arch) {
942
943 case SCMP_ARCH_X86_64:
944 case SCMP_ARCH_X32:
945 case SCMP_ARCH_ARM:
946 case SCMP_ARCH_AARCH64:
947 case SCMP_ARCH_PPC64:
948 case SCMP_ARCH_PPC64LE:
949 /* These we know we support (i.e. are the ones that do not use socketcall()) */
950 supported = true;
951 break;
952
953 case SCMP_ARCH_S390:
954 case SCMP_ARCH_S390X:
955 case SCMP_ARCH_PPC:
956 case SCMP_ARCH_X86:
957 default:
958 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
959 * don't know */
960 supported = false;
961 break;
962 }
963
964 if (!supported)
965 continue;
966
967 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
968 if (r < 0)
969 return r;
970
971 if (whitelist) {
972 int af, first = 0, last = 0;
973 void *afp;
974
975 /* If this is a whitelist, we first block the address families that are out of range and then
976 * everything that is not in the set. First, we find the lowest and highest address family in
977 * the set. */
978
979 SET_FOREACH(afp, address_families, i) {
980 af = PTR_TO_INT(afp);
981
982 if (af <= 0 || af >= af_max())
983 continue;
984
985 if (first == 0 || af < first)
986 first = af;
987
988 if (last == 0 || af > last)
989 last = af;
990 }
991
992 assert((first == 0) == (last == 0));
993
994 if (first == 0) {
995
996 /* No entries in the valid range, block everything */
997 r = seccomp_rule_add_exact(
998 seccomp,
999 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1000 SCMP_SYS(socket),
1001 0);
1002 if (r < 0) {
1003 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1004 continue;
1005 }
1006
1007 } else {
1008
1009 /* Block everything below the first entry */
1010 r = seccomp_rule_add_exact(
1011 seccomp,
1012 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1013 SCMP_SYS(socket),
1014 1,
1015 SCMP_A0(SCMP_CMP_LT, first));
1016 if (r < 0) {
1017 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1018 continue;
1019 }
1020
1021 /* Block everything above the last entry */
1022 r = seccomp_rule_add_exact(
1023 seccomp,
1024 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1025 SCMP_SYS(socket),
1026 1,
1027 SCMP_A0(SCMP_CMP_GT, last));
1028 if (r < 0) {
1029 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1030 continue;
1031 }
1032
1033 /* Block everything between the first and last entry */
1034 for (af = 1; af < af_max(); af++) {
1035
1036 if (set_contains(address_families, INT_TO_PTR(af)))
1037 continue;
1038
1039 r = seccomp_rule_add_exact(
1040 seccomp,
1041 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1042 SCMP_SYS(socket),
1043 1,
1044 SCMP_A0(SCMP_CMP_EQ, af));
1045 if (r < 0)
1046 break;
1047 }
1048
1049 if (r < 0) {
1050 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1051 continue;
1052 }
1053 }
1054
1055 } else {
1056 void *af;
1057
1058 /* If this is a blacklist, then generate one rule for
1059 * each address family that are then combined in OR
1060 * checks. */
1061
1062 SET_FOREACH(af, address_families, i) {
1063
1064 r = seccomp_rule_add_exact(
1065 seccomp,
1066 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1067 SCMP_SYS(socket),
1068 1,
1069 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1070 if (r < 0)
1071 break;
1072 }
1073
1074 if (r < 0) {
1075 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1076 continue;
1077 }
1078 }
1079
1080 r = seccomp_load(seccomp);
1081 if (IN_SET(r, -EPERM, -EACCES))
1082 return r;
1083 if (r < 0)
1084 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1085 }
1086
1087 return 0;
1088 }
1089
1090 int seccomp_restrict_realtime(void) {
1091 static const int permitted_policies[] = {
1092 SCHED_OTHER,
1093 SCHED_BATCH,
1094 SCHED_IDLE,
1095 };
1096
1097 int r, max_policy = 0;
1098 uint32_t arch;
1099 unsigned i;
1100
1101 /* Determine the highest policy constant we want to allow */
1102 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1103 if (permitted_policies[i] > max_policy)
1104 max_policy = permitted_policies[i];
1105
1106 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1107 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1108 int p;
1109
1110 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1111
1112 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1113 if (r < 0)
1114 return r;
1115
1116 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1117 * whitelist. */
1118 for (p = 0; p < max_policy; p++) {
1119 bool good = false;
1120
1121 /* Check if this is in the whitelist. */
1122 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1123 if (permitted_policies[i] == p) {
1124 good = true;
1125 break;
1126 }
1127
1128 if (good)
1129 continue;
1130
1131 /* Deny this policy */
1132 r = seccomp_rule_add_exact(
1133 seccomp,
1134 SCMP_ACT_ERRNO(EPERM),
1135 SCMP_SYS(sched_setscheduler),
1136 1,
1137 SCMP_A1(SCMP_CMP_EQ, p));
1138 if (r < 0) {
1139 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1140 continue;
1141 }
1142 }
1143
1144 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1145 * unsigned here, hence no need no check for < 0 values. */
1146 r = seccomp_rule_add_exact(
1147 seccomp,
1148 SCMP_ACT_ERRNO(EPERM),
1149 SCMP_SYS(sched_setscheduler),
1150 1,
1151 SCMP_A1(SCMP_CMP_GT, max_policy));
1152 if (r < 0) {
1153 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1154 continue;
1155 }
1156
1157 r = seccomp_load(seccomp);
1158 if (IN_SET(r, -EPERM, -EACCES))
1159 return r;
1160 if (r < 0)
1161 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1162 }
1163
1164 return 0;
1165 }
1166
1167 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1168 uint32_t arch,
1169 int nr,
1170 unsigned int arg_cnt,
1171 const struct scmp_arg_cmp arg) {
1172 int r;
1173
1174 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1175 if (r < 0) {
1176 _cleanup_free_ char *n = NULL;
1177
1178 n = seccomp_syscall_resolve_num_arch(arch, nr);
1179 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1180 strna(n),
1181 seccomp_arch_to_string(arch));
1182 }
1183
1184 return r;
1185 }
1186
1187 /* For known architectures, check that syscalls are indeed defined or not. */
1188 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1189 assert_cc(SCMP_SYS(shmget) > 0);
1190 assert_cc(SCMP_SYS(shmat) > 0);
1191 assert_cc(SCMP_SYS(shmdt) > 0);
1192 #elif defined(__i386__) || defined(__powerpc64__)
1193 assert_cc(SCMP_SYS(shmget) < 0);
1194 assert_cc(SCMP_SYS(shmat) < 0);
1195 assert_cc(SCMP_SYS(shmdt) < 0);
1196 #endif
1197
1198 int seccomp_memory_deny_write_execute(void) {
1199
1200 uint32_t arch;
1201 int r;
1202
1203 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1204 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1205 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1206
1207 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1208
1209 switch (arch) {
1210
1211 case SCMP_ARCH_X86:
1212 filter_syscall = SCMP_SYS(mmap2);
1213 block_syscall = SCMP_SYS(mmap);
1214 break;
1215
1216 case SCMP_ARCH_PPC64:
1217 case SCMP_ARCH_PPC64LE:
1218 filter_syscall = SCMP_SYS(mmap);
1219
1220 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1221 * We ignore that here, which means there's still a way to get writable/executable
1222 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1223
1224 break;
1225
1226 case SCMP_ARCH_ARM:
1227 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1228 shmat_syscall = SCMP_SYS(shmat);
1229 break;
1230
1231 case SCMP_ARCH_X86_64:
1232 case SCMP_ARCH_X32:
1233 case SCMP_ARCH_AARCH64:
1234 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1235 shmat_syscall = SCMP_SYS(shmat);
1236 break;
1237
1238 /* Please add more definitions here, if you port systemd to other architectures! */
1239
1240 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1241 #warning "Consider adding the right mmap() syscall definitions here!"
1242 #endif
1243 }
1244
1245 /* Can't filter mmap() on this arch, then skip it */
1246 if (filter_syscall == 0)
1247 continue;
1248
1249 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1250 if (r < 0)
1251 return r;
1252
1253 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1254 1,
1255 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1256 if (r < 0)
1257 continue;
1258
1259 if (block_syscall != 0) {
1260 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1261 if (r < 0)
1262 continue;
1263 }
1264
1265 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1266 1,
1267 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1268 if (r < 0)
1269 continue;
1270
1271 if (shmat_syscall != 0) {
1272 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1273 1,
1274 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1275 if (r < 0)
1276 continue;
1277 }
1278
1279 r = seccomp_load(seccomp);
1280 if (IN_SET(r, -EPERM, -EACCES))
1281 return r;
1282 if (r < 0)
1283 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1284 }
1285
1286 return 0;
1287 }
1288
1289 int seccomp_restrict_archs(Set *archs) {
1290 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1291 Iterator i;
1292 void *id;
1293 int r;
1294
1295 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1296 * list. */
1297
1298 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1299 if (!seccomp)
1300 return -ENOMEM;
1301
1302 SET_FOREACH(id, archs, i) {
1303 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1304 if (r == -EEXIST)
1305 continue;
1306 if (r < 0)
1307 return r;
1308 }
1309
1310 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1311 if (r < 0)
1312 return r;
1313
1314 return seccomp_load(seccomp);
1315 }