]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #6636 from sourcejedi/fsync
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <linux/seccomp.h>
22 #include <seccomp.h>
23 #include <stddef.h>
24 #include <sys/mman.h>
25 #include <sys/prctl.h>
26 #include <sys/shm.h>
27
28 #include "af-list.h"
29 #include "alloc-util.h"
30 #include "macro.h"
31 #include "nsflags.h"
32 #include "seccomp-util.h"
33 #include "set.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "util.h"
37 #include "errno-list.h"
38
39 const uint32_t seccomp_local_archs[] = {
40
41 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
42
43 #if defined(__x86_64__) && defined(__ILP32__)
44 SCMP_ARCH_X86,
45 SCMP_ARCH_X86_64,
46 SCMP_ARCH_X32, /* native */
47 #elif defined(__x86_64__) && !defined(__ILP32__)
48 SCMP_ARCH_X86,
49 SCMP_ARCH_X32,
50 SCMP_ARCH_X86_64, /* native */
51 #elif defined(__i386__)
52 SCMP_ARCH_X86,
53 #elif defined(__aarch64__)
54 SCMP_ARCH_ARM,
55 SCMP_ARCH_AARCH64, /* native */
56 #elif defined(__arm__)
57 SCMP_ARCH_ARM,
58 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS, /* native */
61 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
62 SCMP_ARCH_MIPS,
63 SCMP_ARCH_MIPSEL, /* native */
64 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64N32,
69 SCMP_ARCH_MIPSEL64,
70 SCMP_ARCH_MIPS64, /* native */
71 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64N32,
75 SCMP_ARCH_MIPSEL64N32,
76 SCMP_ARCH_MIPS64,
77 SCMP_ARCH_MIPSEL64, /* native */
78 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
79 SCMP_ARCH_MIPSEL,
80 SCMP_ARCH_MIPS,
81 SCMP_ARCH_MIPSEL64,
82 SCMP_ARCH_MIPS64,
83 SCMP_ARCH_MIPSEL64N32,
84 SCMP_ARCH_MIPS64N32, /* native */
85 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
86 SCMP_ARCH_MIPS,
87 SCMP_ARCH_MIPSEL,
88 SCMP_ARCH_MIPS64,
89 SCMP_ARCH_MIPSEL64,
90 SCMP_ARCH_MIPS64N32,
91 SCMP_ARCH_MIPSEL64N32, /* native */
92 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
93 SCMP_ARCH_PPC,
94 SCMP_ARCH_PPC64LE,
95 SCMP_ARCH_PPC64, /* native */
96 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
97 SCMP_ARCH_PPC,
98 SCMP_ARCH_PPC64,
99 SCMP_ARCH_PPC64LE, /* native */
100 #elif defined(__powerpc__)
101 SCMP_ARCH_PPC,
102 #elif defined(__s390x__)
103 SCMP_ARCH_S390,
104 SCMP_ARCH_S390X, /* native */
105 #elif defined(__s390__)
106 SCMP_ARCH_S390,
107 #endif
108 (uint32_t) -1
109 };
110
111 const char* seccomp_arch_to_string(uint32_t c) {
112 /* Maintain order used in <seccomp.h>.
113 *
114 * Names used here should be the same as those used for ConditionArchitecture=,
115 * except for "subarchitectures" like x32. */
116
117 switch(c) {
118 case SCMP_ARCH_NATIVE:
119 return "native";
120 case SCMP_ARCH_X86:
121 return "x86";
122 case SCMP_ARCH_X86_64:
123 return "x86-64";
124 case SCMP_ARCH_X32:
125 return "x32";
126 case SCMP_ARCH_ARM:
127 return "arm";
128 case SCMP_ARCH_AARCH64:
129 return "arm64";
130 case SCMP_ARCH_MIPS:
131 return "mips";
132 case SCMP_ARCH_MIPS64:
133 return "mips64";
134 case SCMP_ARCH_MIPS64N32:
135 return "mips64-n32";
136 case SCMP_ARCH_MIPSEL:
137 return "mips-le";
138 case SCMP_ARCH_MIPSEL64:
139 return "mips64-le";
140 case SCMP_ARCH_MIPSEL64N32:
141 return "mips64-le-n32";
142 case SCMP_ARCH_PPC:
143 return "ppc";
144 case SCMP_ARCH_PPC64:
145 return "ppc64";
146 case SCMP_ARCH_PPC64LE:
147 return "ppc64-le";
148 case SCMP_ARCH_S390:
149 return "s390";
150 case SCMP_ARCH_S390X:
151 return "s390x";
152 default:
153 return NULL;
154 }
155 }
156
157 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
158 if (!n)
159 return -EINVAL;
160
161 assert(ret);
162
163 if (streq(n, "native"))
164 *ret = SCMP_ARCH_NATIVE;
165 else if (streq(n, "x86"))
166 *ret = SCMP_ARCH_X86;
167 else if (streq(n, "x86-64"))
168 *ret = SCMP_ARCH_X86_64;
169 else if (streq(n, "x32"))
170 *ret = SCMP_ARCH_X32;
171 else if (streq(n, "arm"))
172 *ret = SCMP_ARCH_ARM;
173 else if (streq(n, "arm64"))
174 *ret = SCMP_ARCH_AARCH64;
175 else if (streq(n, "mips"))
176 *ret = SCMP_ARCH_MIPS;
177 else if (streq(n, "mips64"))
178 *ret = SCMP_ARCH_MIPS64;
179 else if (streq(n, "mips64-n32"))
180 *ret = SCMP_ARCH_MIPS64N32;
181 else if (streq(n, "mips-le"))
182 *ret = SCMP_ARCH_MIPSEL;
183 else if (streq(n, "mips64-le"))
184 *ret = SCMP_ARCH_MIPSEL64;
185 else if (streq(n, "mips64-le-n32"))
186 *ret = SCMP_ARCH_MIPSEL64N32;
187 else if (streq(n, "ppc"))
188 *ret = SCMP_ARCH_PPC;
189 else if (streq(n, "ppc64"))
190 *ret = SCMP_ARCH_PPC64;
191 else if (streq(n, "ppc64-le"))
192 *ret = SCMP_ARCH_PPC64LE;
193 else if (streq(n, "s390"))
194 *ret = SCMP_ARCH_S390;
195 else if (streq(n, "s390x"))
196 *ret = SCMP_ARCH_S390X;
197 else
198 return -EINVAL;
199
200 return 0;
201 }
202
203 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
204 scmp_filter_ctx seccomp;
205 int r;
206
207 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
208 * any others. Also, turns off the NNP fiddling. */
209
210 seccomp = seccomp_init(default_action);
211 if (!seccomp)
212 return -ENOMEM;
213
214 if (arch != SCMP_ARCH_NATIVE &&
215 arch != seccomp_arch_native()) {
216
217 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
218 if (r < 0)
219 goto finish;
220
221 r = seccomp_arch_add(seccomp, arch);
222 if (r < 0)
223 goto finish;
224
225 assert(seccomp_arch_exist(seccomp, arch) >= 0);
226 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
227 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
228 } else {
229 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
230 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
231 }
232
233 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
234 if (r < 0)
235 goto finish;
236
237 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
238 if (r < 0)
239 goto finish;
240
241 *ret = seccomp;
242 return 0;
243
244 finish:
245 seccomp_release(seccomp);
246 return r;
247 }
248
/* Returns true if prctl(PR_GET_SECCOMP) succeeds, i.e. returns a non-negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
252
/* Probes for seccomp filter mode by asking prctl() to install a NULL filter program. Returns true only if
 * that call fails with EFAULT; any other outcome is reported as false. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
257
/* Returns true if both the basic seccomp probe and the seccomp filter probe succeed. The result is
 * computed on the first call and cached in a function-local static for all later calls. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, otherwise the cached boolean */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
268
269 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
270 [SYSCALL_FILTER_SET_DEFAULT] = {
271 .name = "@default",
272 .help = "System calls that are always permitted",
273 .value =
274 "clock_getres\0"
275 "clock_gettime\0"
276 "clock_nanosleep\0"
277 "execve\0"
278 "exit\0"
279 "exit_group\0"
280 "getrlimit\0" /* make sure processes can query stack size and such */
281 "gettimeofday\0"
282 "nanosleep\0"
283 "pause\0"
284 "rt_sigreturn\0"
285 "sigreturn\0"
286 "time\0"
287 },
288 [SYSCALL_FILTER_SET_BASIC_IO] = {
289 .name = "@basic-io",
290 .help = "Basic IO",
291 .value =
292 "close\0"
293 "dup2\0"
294 "dup3\0"
295 "dup\0"
296 "lseek\0"
297 "pread64\0"
298 "preadv\0"
299 "pwrite64\0"
300 "pwritev\0"
301 "read\0"
302 "readv\0"
303 "write\0"
304 "writev\0"
305 },
306 [SYSCALL_FILTER_SET_CLOCK] = {
307 .name = "@clock",
308 .help = "Change the system time",
309 .value =
310 "adjtimex\0"
311 "clock_adjtime\0"
312 "clock_settime\0"
313 "settimeofday\0"
314 "stime\0"
315 },
316 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
317 .name = "@cpu-emulation",
318 .help = "System calls for CPU emulation functionality",
319 .value =
320 "modify_ldt\0"
321 "subpage_prot\0"
322 "switch_endian\0"
323 "vm86\0"
324 "vm86old\0"
325 },
326 [SYSCALL_FILTER_SET_DEBUG] = {
327 .name = "@debug",
328 .help = "Debugging, performance monitoring and tracing functionality",
329 .value =
330 "lookup_dcookie\0"
331 "perf_event_open\0"
332 "process_vm_readv\0"
333 "process_vm_writev\0"
334 "ptrace\0"
335 "rtas\0"
336 #ifdef __NR_s390_runtime_instr
337 "s390_runtime_instr\0"
338 #endif
339 "sys_debug_setcontext\0"
340 },
341 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
342 .name = "@file-system",
343 .help = "File system operations",
344 .value =
345 "access\0"
346 "chdir\0"
347 "chmod\0"
348 "close\0"
349 "creat\0"
350 "faccessat\0"
351 "fallocate\0"
352 "fchdir\0"
353 "fchmod\0"
354 "fchmodat\0"
355 "fcntl64\0"
356 "fcntl\0"
357 "fgetxattr\0"
358 "flistxattr\0"
359 "fsetxattr\0"
360 "fstat64\0"
361 "fstat\0"
362 "fstatat64\0"
363 "fstatfs64\0"
364 "fstatfs\0"
365 "ftruncate64\0"
366 "ftruncate\0"
367 "futimesat\0"
368 "getcwd\0"
369 "getdents64\0"
370 "getdents\0"
371 "getxattr\0"
372 "inotify_add_watch\0"
373 "inotify_init1\0"
374 "inotify_rm_watch\0"
375 "lgetxattr\0"
376 "link\0"
377 "linkat\0"
378 "listxattr\0"
379 "llistxattr\0"
380 "lremovexattr\0"
381 "lsetxattr\0"
382 "lstat64\0"
383 "lstat\0"
384 "mkdir\0"
385 "mkdirat\0"
386 "mknod\0"
387 "mknodat\0"
388 "mmap2\0"
389 "mmap\0"
390 "munmap\0"
391 "newfstatat\0"
392 "open\0"
393 "openat\0"
394 "readlink\0"
395 "readlinkat\0"
396 "removexattr\0"
397 "rename\0"
398 "renameat2\0"
399 "renameat\0"
400 "rmdir\0"
401 "setxattr\0"
402 "stat64\0"
403 "stat\0"
404 "statfs\0"
405 "symlink\0"
406 "symlinkat\0"
407 "truncate64\0"
408 "truncate\0"
409 "unlink\0"
410 "unlinkat\0"
411 "utimensat\0"
412 "utimes\0"
413 },
414 [SYSCALL_FILTER_SET_IO_EVENT] = {
415 .name = "@io-event",
416 .help = "Event loop system calls",
417 .value =
418 "_newselect\0"
419 "epoll_create1\0"
420 "epoll_create\0"
421 "epoll_ctl\0"
422 "epoll_ctl_old\0"
423 "epoll_pwait\0"
424 "epoll_wait\0"
425 "epoll_wait_old\0"
426 "eventfd2\0"
427 "eventfd\0"
428 "poll\0"
429 "ppoll\0"
430 "pselect6\0"
431 "select\0"
432 },
433 [SYSCALL_FILTER_SET_IPC] = {
434 .name = "@ipc",
435 .help = "SysV IPC, POSIX Message Queues or other IPC",
436 .value =
437 "ipc\0"
438 "memfd_create\0"
439 "mq_getsetattr\0"
440 "mq_notify\0"
441 "mq_open\0"
442 "mq_timedreceive\0"
443 "mq_timedsend\0"
444 "mq_unlink\0"
445 "msgctl\0"
446 "msgget\0"
447 "msgrcv\0"
448 "msgsnd\0"
449 "pipe2\0"
450 "pipe\0"
451 "process_vm_readv\0"
452 "process_vm_writev\0"
453 "semctl\0"
454 "semget\0"
455 "semop\0"
456 "semtimedop\0"
457 "shmat\0"
458 "shmctl\0"
459 "shmdt\0"
460 "shmget\0"
461 },
462 [SYSCALL_FILTER_SET_KEYRING] = {
463 .name = "@keyring",
464 .help = "Kernel keyring access",
465 .value =
466 "add_key\0"
467 "keyctl\0"
468 "request_key\0"
469 },
470 [SYSCALL_FILTER_SET_MODULE] = {
471 .name = "@module",
472 .help = "Loading and unloading of kernel modules",
473 .value =
474 "delete_module\0"
475 "finit_module\0"
476 "init_module\0"
477 },
478 [SYSCALL_FILTER_SET_MOUNT] = {
479 .name = "@mount",
480 .help = "Mounting and unmounting of file systems",
481 .value =
482 "chroot\0"
483 "mount\0"
484 "pivot_root\0"
485 "umount2\0"
486 "umount\0"
487 },
488 [SYSCALL_FILTER_SET_NETWORK_IO] = {
489 .name = "@network-io",
490 .help = "Network or Unix socket IO, should not be needed if not network facing",
491 .value =
492 "accept4\0"
493 "accept\0"
494 "bind\0"
495 "connect\0"
496 "getpeername\0"
497 "getsockname\0"
498 "getsockopt\0"
499 "listen\0"
500 "recv\0"
501 "recvfrom\0"
502 "recvmmsg\0"
503 "recvmsg\0"
504 "send\0"
505 "sendmmsg\0"
506 "sendmsg\0"
507 "sendto\0"
508 "setsockopt\0"
509 "shutdown\0"
510 "socket\0"
511 "socketcall\0"
512 "socketpair\0"
513 },
514 [SYSCALL_FILTER_SET_OBSOLETE] = {
515 /* some unknown even to libseccomp */
516 .name = "@obsolete",
517 .help = "Unusual, obsolete or unimplemented system calls",
518 .value =
519 "_sysctl\0"
520 "afs_syscall\0"
521 "bdflush\0"
522 "break\0"
523 "create_module\0"
524 "ftime\0"
525 "get_kernel_syms\0"
526 "getpmsg\0"
527 "gtty\0"
528 "lock\0"
529 "mpx\0"
530 "prof\0"
531 "profil\0"
532 "putpmsg\0"
533 "query_module\0"
534 "security\0"
535 "sgetmask\0"
536 "ssetmask\0"
537 "stty\0"
538 "sysfs\0"
539 "tuxcall\0"
540 "ulimit\0"
541 "uselib\0"
542 "ustat\0"
543 "vserver\0"
544 },
545 [SYSCALL_FILTER_SET_PRIVILEGED] = {
546 .name = "@privileged",
547 .help = "All system calls which need super-user capabilities",
548 .value =
549 "@clock\0"
550 "@module\0"
551 "@raw-io\0"
552 "acct\0"
553 "bpf\0"
554 "capset\0"
555 "chown32\0"
556 "chown\0"
557 "chroot\0"
558 "fchown32\0"
559 "fchown\0"
560 "fchownat\0"
561 "kexec_file_load\0"
562 "kexec_load\0"
563 "lchown32\0"
564 "lchown\0"
565 "nfsservctl\0"
566 "pivot_root\0"
567 "quotactl\0"
568 "reboot\0"
569 "setdomainname\0"
570 "setfsuid32\0"
571 "setfsuid\0"
572 "setgroups32\0"
573 "setgroups\0"
574 "sethostname\0"
575 "setresuid32\0"
576 "setresuid\0"
577 "setreuid32\0"
578 "setreuid\0"
579 "setuid32\0"
580 "setuid\0"
581 "swapoff\0"
582 "swapon\0"
583 "_sysctl\0"
584 "vhangup\0"
585 },
586 [SYSCALL_FILTER_SET_PROCESS] = {
587 .name = "@process",
588 .help = "Process control, execution, namespaceing operations",
589 .value =
590 "arch_prctl\0"
591 "clone\0"
592 "execveat\0"
593 "fork\0"
594 "kill\0"
595 "prctl\0"
596 "setns\0"
597 "tgkill\0"
598 "tkill\0"
599 "unshare\0"
600 "vfork\0"
601 },
602 [SYSCALL_FILTER_SET_RAW_IO] = {
603 .name = "@raw-io",
604 .help = "Raw I/O port access",
605 .value =
606 "ioperm\0"
607 "iopl\0"
608 "pciconfig_iobase\0"
609 "pciconfig_read\0"
610 "pciconfig_write\0"
611 #ifdef __NR_s390_pci_mmio_read
612 "s390_pci_mmio_read\0"
613 #endif
614 #ifdef __NR_s390_pci_mmio_write
615 "s390_pci_mmio_write\0"
616 #endif
617 },
618 [SYSCALL_FILTER_SET_REBOOT] = {
619 .name = "@reboot",
620 .help = "Reboot and reboot preparation/kexec",
621 .value =
622 "kexec\0"
623 "kexec_file_load\0"
624 "reboot\0"
625 },
626 [SYSCALL_FILTER_SET_RESOURCES] = {
627 .name = "@resources",
628 .help = "Alter resource settings",
629 .value =
630 "sched_setparam\0"
631 "sched_setscheduler\0"
632 "sched_setaffinity\0"
633 "setpriority\0"
634 "setrlimit\0"
635 "set_mempolicy\0"
636 "migrate_pages\0"
637 "move_pages\0"
638 "mbind\0"
639 "sched_setattr\0"
640 "prlimit64\0"
641 },
642 [SYSCALL_FILTER_SET_SETUID] = {
643 .name = "@setuid",
644 .help = "Operations for changing user/group credentials",
645 .value =
646 "setgid32\0"
647 "setgid\0"
648 "setgroups32\0"
649 "setgroups\0"
650 "setregid32\0"
651 "setregid\0"
652 "setresgid32\0"
653 "setresgid\0"
654 "setresuid32\0"
655 "setresuid\0"
656 "setreuid32\0"
657 "setreuid\0"
658 "setuid32\0"
659 "setuid\0"
660 },
661 [SYSCALL_FILTER_SET_SWAP] = {
662 .name = "@swap",
663 .help = "Enable/disable swap devices",
664 .value =
665 "swapoff\0"
666 "swapon\0"
667 },
668 };
669
670 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
671 unsigned i;
672
673 if (isempty(name) || name[0] != '@')
674 return NULL;
675
676 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
677 if (streq(syscall_filter_sets[i].name, name))
678 return syscall_filter_sets + i;
679
680 return NULL;
681 }
682
683 static int seccomp_add_syscall_filter_set(
684 scmp_filter_ctx seccomp,
685 uint32_t default_action,
686 const SyscallFilterSet *set,
687 uint32_t action) {
688
689 const char *sys;
690 int r;
691
692 assert(seccomp);
693 assert(set);
694
695 NULSTR_FOREACH(sys, set->value) {
696 int id;
697
698 if (sys[0] == '@') {
699 const SyscallFilterSet *other;
700
701 other = syscall_filter_set_find(sys);
702 if (!other)
703 return -EINVAL;
704
705 r = seccomp_add_syscall_filter_set(seccomp, default_action, other, action);
706 if (r < 0)
707 return r;
708 } else {
709 id = seccomp_syscall_resolve_name(sys);
710 if (id == __NR_SCMP_ERROR)
711 return -EINVAL; /* Not known at all? Then that's a real error */
712
713 r = seccomp_rule_add_exact(seccomp, action, id, 0);
714 if (r < 0)
715 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
716 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", sys, id);
717 }
718 }
719
720 return 0;
721 }
722
723 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
724 uint32_t arch;
725 int r;
726
727 assert(set);
728
729 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
730 * earch local arch. */
731
732 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
733 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
734
735 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
736
737 r = seccomp_init_for_arch(&seccomp, arch, default_action);
738 if (r < 0)
739 return r;
740
741 r = seccomp_add_syscall_filter_set(seccomp, default_action, set, action);
742 if (r < 0) {
743 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
744 continue;
745 }
746
747 r = seccomp_load(seccomp);
748 if (IN_SET(r, -EPERM, -EACCES))
749 return r;
750 if (r < 0)
751 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
752 }
753
754 return 0;
755 }
756
757 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
758 uint32_t arch;
759 int r;
760
761 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
762 * SyscallFilterSet* table. */
763
764 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
765 return 0;
766
767 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
768 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
769 Iterator i;
770 void *id;
771
772 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
773
774 r = seccomp_init_for_arch(&seccomp, arch, default_action);
775 if (r < 0)
776 return r;
777
778 SET_FOREACH(id, set, i) {
779 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
780 if (r < 0) {
781 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
782 _cleanup_free_ char *n = NULL;
783
784 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
785 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
786 }
787 }
788
789 r = seccomp_load(seccomp);
790 if (IN_SET(r, -EPERM, -EACCES))
791 return r;
792 if (r < 0)
793 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
794 }
795
796 return 0;
797 }
798
799 int seccomp_restrict_namespaces(unsigned long retain) {
800 uint32_t arch;
801 int r;
802
803 if (log_get_max_level() >= LOG_DEBUG) {
804 _cleanup_free_ char *s = NULL;
805
806 (void) namespace_flag_to_string_many(retain, &s);
807 log_debug("Restricting namespace to: %s.", strna(s));
808 }
809
810 /* NOOP? */
811 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
812 return 0;
813
814 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
815 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
816 unsigned i;
817
818 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
819
820 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
821 if (r < 0)
822 return r;
823
824 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
825 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
826 * altogether. */
827 r = seccomp_rule_add_exact(
828 seccomp,
829 SCMP_ACT_ERRNO(EPERM),
830 SCMP_SYS(setns),
831 0);
832 else
833 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
834 * special invocation with a zero flags argument, right here. */
835 r = seccomp_rule_add_exact(
836 seccomp,
837 SCMP_ACT_ERRNO(EPERM),
838 SCMP_SYS(setns),
839 1,
840 SCMP_A1(SCMP_CMP_EQ, 0));
841 if (r < 0) {
842 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
843 continue;
844 }
845
846 for (i = 0; namespace_flag_map[i].name; i++) {
847 unsigned long f;
848
849 f = namespace_flag_map[i].flag;
850 if ((retain & f) == f) {
851 log_debug("Permitting %s.", namespace_flag_map[i].name);
852 continue;
853 }
854
855 log_debug("Blocking %s.", namespace_flag_map[i].name);
856
857 r = seccomp_rule_add_exact(
858 seccomp,
859 SCMP_ACT_ERRNO(EPERM),
860 SCMP_SYS(unshare),
861 1,
862 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
863 if (r < 0) {
864 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
865 break;
866 }
867
868 /* On s390/s390x the first two parameters to clone are switched */
869 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
870 r = seccomp_rule_add_exact(
871 seccomp,
872 SCMP_ACT_ERRNO(EPERM),
873 SCMP_SYS(clone),
874 1,
875 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
876 else
877 r = seccomp_rule_add_exact(
878 seccomp,
879 SCMP_ACT_ERRNO(EPERM),
880 SCMP_SYS(clone),
881 1,
882 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
883 if (r < 0) {
884 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
885 break;
886 }
887
888 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
889 r = seccomp_rule_add_exact(
890 seccomp,
891 SCMP_ACT_ERRNO(EPERM),
892 SCMP_SYS(setns),
893 1,
894 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
895 if (r < 0) {
896 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
897 break;
898 }
899 }
900 }
901 if (r < 0)
902 continue;
903
904 r = seccomp_load(seccomp);
905 if (IN_SET(r, -EPERM, -EACCES))
906 return r;
907 if (r < 0)
908 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
909 }
910
911 return 0;
912 }
913
914 int seccomp_protect_sysctl(void) {
915 uint32_t arch;
916 int r;
917
918 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
919 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
920
921 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
922
923 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
924 /* No _sysctl syscall */
925 continue;
926
927 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
928 if (r < 0)
929 return r;
930
931 r = seccomp_rule_add_exact(
932 seccomp,
933 SCMP_ACT_ERRNO(EPERM),
934 SCMP_SYS(_sysctl),
935 0);
936 if (r < 0) {
937 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
938 continue;
939 }
940
941 r = seccomp_load(seccomp);
942 if (IN_SET(r, -EPERM, -EACCES))
943 return r;
944 if (r < 0)
945 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
946 }
947
948 return 0;
949 }
950
951 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
952 uint32_t arch;
953 int r;
954
955 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
956 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
957 bool supported;
958 Iterator i;
959
960 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
961
962 switch (arch) {
963
964 case SCMP_ARCH_X86_64:
965 case SCMP_ARCH_X32:
966 case SCMP_ARCH_ARM:
967 case SCMP_ARCH_AARCH64:
968 case SCMP_ARCH_PPC64:
969 case SCMP_ARCH_PPC64LE:
970 /* These we know we support (i.e. are the ones that do not use socketcall()) */
971 supported = true;
972 break;
973
974 case SCMP_ARCH_S390:
975 case SCMP_ARCH_S390X:
976 case SCMP_ARCH_PPC:
977 case SCMP_ARCH_X86:
978 default:
979 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
980 * don't know */
981 supported = false;
982 break;
983 }
984
985 if (!supported)
986 continue;
987
988 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
989 if (r < 0)
990 return r;
991
992 if (whitelist) {
993 int af, first = 0, last = 0;
994 void *afp;
995
996 /* If this is a whitelist, we first block the address families that are out of range and then
997 * everything that is not in the set. First, we find the lowest and highest address family in
998 * the set. */
999
1000 SET_FOREACH(afp, address_families, i) {
1001 af = PTR_TO_INT(afp);
1002
1003 if (af <= 0 || af >= af_max())
1004 continue;
1005
1006 if (first == 0 || af < first)
1007 first = af;
1008
1009 if (last == 0 || af > last)
1010 last = af;
1011 }
1012
1013 assert((first == 0) == (last == 0));
1014
1015 if (first == 0) {
1016
1017 /* No entries in the valid range, block everything */
1018 r = seccomp_rule_add_exact(
1019 seccomp,
1020 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1021 SCMP_SYS(socket),
1022 0);
1023 if (r < 0) {
1024 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1025 continue;
1026 }
1027
1028 } else {
1029
1030 /* Block everything below the first entry */
1031 r = seccomp_rule_add_exact(
1032 seccomp,
1033 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1034 SCMP_SYS(socket),
1035 1,
1036 SCMP_A0(SCMP_CMP_LT, first));
1037 if (r < 0) {
1038 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1039 continue;
1040 }
1041
1042 /* Block everything above the last entry */
1043 r = seccomp_rule_add_exact(
1044 seccomp,
1045 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1046 SCMP_SYS(socket),
1047 1,
1048 SCMP_A0(SCMP_CMP_GT, last));
1049 if (r < 0) {
1050 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1051 continue;
1052 }
1053
1054 /* Block everything between the first and last entry */
1055 for (af = 1; af < af_max(); af++) {
1056
1057 if (set_contains(address_families, INT_TO_PTR(af)))
1058 continue;
1059
1060 r = seccomp_rule_add_exact(
1061 seccomp,
1062 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1063 SCMP_SYS(socket),
1064 1,
1065 SCMP_A0(SCMP_CMP_EQ, af));
1066 if (r < 0)
1067 break;
1068 }
1069
1070 if (r < 0) {
1071 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1072 continue;
1073 }
1074 }
1075
1076 } else {
1077 void *af;
1078
1079 /* If this is a blacklist, then generate one rule for
1080 * each address family that are then combined in OR
1081 * checks. */
1082
1083 SET_FOREACH(af, address_families, i) {
1084
1085 r = seccomp_rule_add_exact(
1086 seccomp,
1087 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1088 SCMP_SYS(socket),
1089 1,
1090 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1091 if (r < 0)
1092 break;
1093 }
1094
1095 if (r < 0) {
1096 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1097 continue;
1098 }
1099 }
1100
1101 r = seccomp_load(seccomp);
1102 if (IN_SET(r, -EPERM, -EACCES))
1103 return r;
1104 if (r < 0)
1105 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1106 }
1107
1108 return 0;
1109 }
1110
1111 int seccomp_restrict_realtime(void) {
1112 static const int permitted_policies[] = {
1113 SCHED_OTHER,
1114 SCHED_BATCH,
1115 SCHED_IDLE,
1116 };
1117
1118 int r, max_policy = 0;
1119 uint32_t arch;
1120 unsigned i;
1121
1122 /* Determine the highest policy constant we want to allow */
1123 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1124 if (permitted_policies[i] > max_policy)
1125 max_policy = permitted_policies[i];
1126
1127 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1128 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1129 int p;
1130
1131 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1132
1133 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1134 if (r < 0)
1135 return r;
1136
1137 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1138 * whitelist. */
1139 for (p = 0; p < max_policy; p++) {
1140 bool good = false;
1141
1142 /* Check if this is in the whitelist. */
1143 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1144 if (permitted_policies[i] == p) {
1145 good = true;
1146 break;
1147 }
1148
1149 if (good)
1150 continue;
1151
1152 /* Deny this policy */
1153 r = seccomp_rule_add_exact(
1154 seccomp,
1155 SCMP_ACT_ERRNO(EPERM),
1156 SCMP_SYS(sched_setscheduler),
1157 1,
1158 SCMP_A1(SCMP_CMP_EQ, p));
1159 if (r < 0) {
1160 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1161 continue;
1162 }
1163 }
1164
1165 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1166 * unsigned here, hence no need no check for < 0 values. */
1167 r = seccomp_rule_add_exact(
1168 seccomp,
1169 SCMP_ACT_ERRNO(EPERM),
1170 SCMP_SYS(sched_setscheduler),
1171 1,
1172 SCMP_A1(SCMP_CMP_GT, max_policy));
1173 if (r < 0) {
1174 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1175 continue;
1176 }
1177
1178 r = seccomp_load(seccomp);
1179 if (IN_SET(r, -EPERM, -EACCES))
1180 return r;
1181 if (r < 0)
1182 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1183 }
1184
1185 return 0;
1186 }
1187
1188 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1189 uint32_t arch,
1190 int nr,
1191 unsigned int arg_cnt,
1192 const struct scmp_arg_cmp arg) {
1193 int r;
1194
1195 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1196 if (r < 0) {
1197 _cleanup_free_ char *n = NULL;
1198
1199 n = seccomp_syscall_resolve_num_arch(arch, nr);
1200 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1201 strna(n),
1202 seccomp_arch_to_string(arch));
1203 }
1204
1205 return r;
1206 }
1207
1208 /* For known architectures, check that syscalls are indeed defined or not. */
1209 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1210 assert_cc(SCMP_SYS(shmget) > 0);
1211 assert_cc(SCMP_SYS(shmat) > 0);
1212 assert_cc(SCMP_SYS(shmdt) > 0);
1213 #elif defined(__i386__) || defined(__powerpc64__)
1214 assert_cc(SCMP_SYS(shmget) < 0);
1215 assert_cc(SCMP_SYS(shmat) < 0);
1216 assert_cc(SCMP_SYS(shmdt) < 0);
1217 #endif
1218
1219 int seccomp_memory_deny_write_execute(void) {
1220
1221 uint32_t arch;
1222 int r;
1223
1224 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1225 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1226 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1227
1228 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1229
1230 switch (arch) {
1231
1232 case SCMP_ARCH_X86:
1233 filter_syscall = SCMP_SYS(mmap2);
1234 block_syscall = SCMP_SYS(mmap);
1235 break;
1236
1237 case SCMP_ARCH_PPC64:
1238 case SCMP_ARCH_PPC64LE:
1239 filter_syscall = SCMP_SYS(mmap);
1240
1241 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1242 * We ignore that here, which means there's still a way to get writable/executable
1243 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1244
1245 break;
1246
1247 case SCMP_ARCH_ARM:
1248 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1249 shmat_syscall = SCMP_SYS(shmat);
1250 break;
1251
1252 case SCMP_ARCH_X86_64:
1253 case SCMP_ARCH_X32:
1254 case SCMP_ARCH_AARCH64:
1255 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1256 shmat_syscall = SCMP_SYS(shmat);
1257 break;
1258
1259 /* Please add more definitions here, if you port systemd to other architectures! */
1260
1261 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1262 #warning "Consider adding the right mmap() syscall definitions here!"
1263 #endif
1264 }
1265
1266 /* Can't filter mmap() on this arch, then skip it */
1267 if (filter_syscall == 0)
1268 continue;
1269
1270 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1271 if (r < 0)
1272 return r;
1273
1274 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1275 1,
1276 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1277 if (r < 0)
1278 continue;
1279
1280 if (block_syscall != 0) {
1281 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1282 if (r < 0)
1283 continue;
1284 }
1285
1286 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1287 1,
1288 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1289 if (r < 0)
1290 continue;
1291
1292 if (shmat_syscall != 0) {
1293 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1294 1,
1295 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1296 if (r < 0)
1297 continue;
1298 }
1299
1300 r = seccomp_load(seccomp);
1301 if (IN_SET(r, -EPERM, -EACCES))
1302 return r;
1303 if (r < 0)
1304 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1305 }
1306
1307 return 0;
1308 }
1309
1310 int seccomp_restrict_archs(Set *archs) {
1311 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1312 Iterator i;
1313 void *id;
1314 int r;
1315
1316 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1317 * list. */
1318
1319 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1320 if (!seccomp)
1321 return -ENOMEM;
1322
1323 SET_FOREACH(id, archs, i) {
1324 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1325 if (r == -EEXIST)
1326 continue;
1327 if (r < 0)
1328 return r;
1329 }
1330
1331 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1332 if (r < 0)
1333 return r;
1334
1335 return seccomp_load(seccomp);
1336 }
1337
1338 int parse_syscall_archs(char **l, Set **archs) {
1339 _cleanup_set_free_ Set *_archs;
1340 char **s;
1341 int r;
1342
1343 assert(l);
1344 assert(archs);
1345
1346 r = set_ensure_allocated(&_archs, NULL);
1347 if (r < 0)
1348 return r;
1349
1350 STRV_FOREACH(s, l) {
1351 uint32_t a;
1352
1353 r = seccomp_arch_from_string(*s, &a);
1354 if (r < 0)
1355 return -EINVAL;
1356
1357 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1358 if (r < 0)
1359 return -ENOMEM;
1360 }
1361
1362 *archs = _archs;
1363 _archs = NULL;
1364
1365 return 0;
1366 }
1367
1368 int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
1369 const char *i;
1370 int r;
1371
1372 assert(set);
1373
1374 NULSTR_FOREACH(i, set->value) {
1375
1376 if (i[0] == '@') {
1377 const SyscallFilterSet *more;
1378
1379 more = syscall_filter_set_find(i);
1380 if (!more)
1381 return -ENXIO;
1382
1383
1384 r = seccomp_filter_set_add(filter, add, more);
1385 if (r < 0)
1386 return r;
1387 } else {
1388 int id;
1389
1390 id = seccomp_syscall_resolve_name(i);
1391 if (id == __NR_SCMP_ERROR)
1392 return -ENXIO;
1393
1394 if (add) {
1395 r = set_put(filter, INT_TO_PTR(id + 1));
1396 if (r < 0)
1397 return r;
1398 } else
1399 (void) set_remove(filter, INT_TO_PTR(id + 1));
1400 }
1401 }
1402
1403 return 0;
1404 }