]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
seccomp: don't ever try to add an ABI before removing the default native ABI (#5230)
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <linux/seccomp.h>
22 #include <seccomp.h>
23 #include <stddef.h>
24 #include <sys/mman.h>
25 #include <sys/prctl.h>
26 #include <sys/shm.h>
27
28 #include "af-list.h"
29 #include "alloc-util.h"
30 #include "macro.h"
31 #include "nsflags.h"
32 #include "seccomp-util.h"
33 #include "string-util.h"
34 #include "util.h"
35 #include "errno-list.h"
36
37 const uint32_t seccomp_local_archs[] = {
38
39 #if defined(__i386__) || defined(__x86_64__)
40 SCMP_ARCH_X86,
41 SCMP_ARCH_X86_64,
42 SCMP_ARCH_X32,
43
44 #elif defined(__arm__) || defined(__aarch64__)
45 SCMP_ARCH_ARM,
46 SCMP_ARCH_AARCH64,
47
48 #elif defined(__mips__) || defined(__mips64__)
49 SCMP_ARCH_MIPS,
50 SCMP_ARCH_MIPS64,
51 SCMP_ARCH_MIPS64N32,
52 SCMP_ARCH_MIPSEL,
53 SCMP_ARCH_MIPSEL64,
54 SCMP_ARCH_MIPSEL64N32,
55
56 #elif defined(__powerpc__) || defined(__powerpc64__)
57 SCMP_ARCH_PPC,
58 SCMP_ARCH_PPC64,
59 SCMP_ARCH_PPC64LE,
60
61 #elif defined(__s390__) || defined(__s390x__)
62 SCMP_ARCH_S390,
63 SCMP_ARCH_S390X,
64 #endif
65 (uint32_t) -1
66 };
67
68 const char* seccomp_arch_to_string(uint32_t c) {
69 /* Maintain order used in <seccomp.h>.
70 *
71 * Names used here should be the same as those used for ConditionArchitecture=,
72 * except for "subarchitectures" like x32. */
73
74 switch(c) {
75 case SCMP_ARCH_NATIVE:
76 return "native";
77 case SCMP_ARCH_X86:
78 return "x86";
79 case SCMP_ARCH_X86_64:
80 return "x86-64";
81 case SCMP_ARCH_X32:
82 return "x32";
83 case SCMP_ARCH_ARM:
84 return "arm";
85 case SCMP_ARCH_AARCH64:
86 return "arm64";
87 case SCMP_ARCH_MIPS:
88 return "mips";
89 case SCMP_ARCH_MIPS64:
90 return "mips64";
91 case SCMP_ARCH_MIPS64N32:
92 return "mips64-n32";
93 case SCMP_ARCH_MIPSEL:
94 return "mips-le";
95 case SCMP_ARCH_MIPSEL64:
96 return "mips64-le";
97 case SCMP_ARCH_MIPSEL64N32:
98 return "mips64-le-n32";
99 case SCMP_ARCH_PPC:
100 return "ppc";
101 case SCMP_ARCH_PPC64:
102 return "ppc64";
103 case SCMP_ARCH_PPC64LE:
104 return "ppc64-le";
105 case SCMP_ARCH_S390:
106 return "s390";
107 case SCMP_ARCH_S390X:
108 return "s390x";
109 default:
110 return NULL;
111 }
112 }
113
114 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
115 if (!n)
116 return -EINVAL;
117
118 assert(ret);
119
120 if (streq(n, "native"))
121 *ret = SCMP_ARCH_NATIVE;
122 else if (streq(n, "x86"))
123 *ret = SCMP_ARCH_X86;
124 else if (streq(n, "x86-64"))
125 *ret = SCMP_ARCH_X86_64;
126 else if (streq(n, "x32"))
127 *ret = SCMP_ARCH_X32;
128 else if (streq(n, "arm"))
129 *ret = SCMP_ARCH_ARM;
130 else if (streq(n, "arm64"))
131 *ret = SCMP_ARCH_AARCH64;
132 else if (streq(n, "mips"))
133 *ret = SCMP_ARCH_MIPS;
134 else if (streq(n, "mips64"))
135 *ret = SCMP_ARCH_MIPS64;
136 else if (streq(n, "mips64-n32"))
137 *ret = SCMP_ARCH_MIPS64N32;
138 else if (streq(n, "mips-le"))
139 *ret = SCMP_ARCH_MIPSEL;
140 else if (streq(n, "mips64-le"))
141 *ret = SCMP_ARCH_MIPSEL64;
142 else if (streq(n, "mips64-le-n32"))
143 *ret = SCMP_ARCH_MIPSEL64N32;
144 else if (streq(n, "ppc"))
145 *ret = SCMP_ARCH_PPC;
146 else if (streq(n, "ppc64"))
147 *ret = SCMP_ARCH_PPC64;
148 else if (streq(n, "ppc64-le"))
149 *ret = SCMP_ARCH_PPC64LE;
150 else if (streq(n, "s390"))
151 *ret = SCMP_ARCH_S390;
152 else if (streq(n, "s390x"))
153 *ret = SCMP_ARCH_S390X;
154 else
155 return -EINVAL;
156
157 return 0;
158 }
159
160 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
161 scmp_filter_ctx seccomp;
162 int r;
163
164 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
165 * any others. Also, turns off the NNP fiddling. */
166
167 seccomp = seccomp_init(default_action);
168 if (!seccomp)
169 return -ENOMEM;
170
171 if (arch != SCMP_ARCH_NATIVE &&
172 arch != seccomp_arch_native()) {
173
174 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
175 if (r < 0)
176 goto finish;
177
178 r = seccomp_arch_add(seccomp, arch);
179 if (r < 0)
180 goto finish;
181
182 assert(seccomp_arch_exist(seccomp, arch) >= 0);
183 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
184 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
185 } else {
186 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
187 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
188 }
189
190 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
191 if (r < 0)
192 goto finish;
193
194 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
195 if (r < 0)
196 goto finish;
197
198 *ret = seccomp;
199 return 0;
200
201 finish:
202 seccomp_release(seccomp);
203 return r;
204 }
205
/* Returns true if the kernel answers the PR_GET_SECCOMP prctl() without error. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
209
/* Probes for seccomp filter mode by requesting it with a NULL filter program. Returns true only if the
 * call fails with EFAULT; success or any other error is reported as false. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
214
/* Returns true if both the basic seccomp probe and the seccomp filter probe succeed. The probes are run
 * on the first call only; the result is kept in a static variable and reused afterwards. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, otherwise the cached boolean result */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
225
226 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
227 [SYSCALL_FILTER_SET_DEFAULT] = {
228 .name = "@default",
229 .help = "System calls that are always permitted",
230 .value =
231 "clock_getres\0"
232 "clock_gettime\0"
233 "clock_nanosleep\0"
234 "execve\0"
235 "exit\0"
236 "exit_group\0"
237 "getrlimit\0" /* make sure processes can query stack size and such */
238 "gettimeofday\0"
239 "nanosleep\0"
240 "pause\0"
241 "rt_sigreturn\0"
242 "sigreturn\0"
243 "time\0"
244 },
245 [SYSCALL_FILTER_SET_BASIC_IO] = {
246 .name = "@basic-io",
247 .help = "Basic IO",
248 .value =
249 "close\0"
250 "dup2\0"
251 "dup3\0"
252 "dup\0"
253 "lseek\0"
254 "pread64\0"
255 "preadv\0"
256 "pwrite64\0"
257 "pwritev\0"
258 "read\0"
259 "readv\0"
260 "write\0"
261 "writev\0"
262 },
263 [SYSCALL_FILTER_SET_CLOCK] = {
264 .name = "@clock",
265 .help = "Change the system time",
266 .value =
267 "adjtimex\0"
268 "clock_adjtime\0"
269 "clock_settime\0"
270 "settimeofday\0"
271 "stime\0"
272 },
273 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
274 .name = "@cpu-emulation",
275 .help = "System calls for CPU emulation functionality",
276 .value =
277 "modify_ldt\0"
278 "subpage_prot\0"
279 "switch_endian\0"
280 "vm86\0"
281 "vm86old\0"
282 },
283 [SYSCALL_FILTER_SET_DEBUG] = {
284 .name = "@debug",
285 .help = "Debugging, performance monitoring and tracing functionality",
286 .value =
287 "lookup_dcookie\0"
288 "perf_event_open\0"
289 "process_vm_readv\0"
290 "process_vm_writev\0"
291 "ptrace\0"
292 "rtas\0"
293 #ifdef __NR_s390_runtime_instr
294 "s390_runtime_instr\0"
295 #endif
296 "sys_debug_setcontext\0"
297 },
298 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
299 .name = "@file-system",
300 .help = "File system operations",
301 .value =
302 "access\0"
303 "chdir\0"
304 "chmod\0"
305 "close\0"
306 "creat\0"
307 "faccessat\0"
308 "fallocate\0"
309 "fchdir\0"
310 "fchmod\0"
311 "fchmodat\0"
312 "fcntl64\0"
313 "fcntl\0"
314 "fgetxattr\0"
315 "flistxattr\0"
316 "fsetxattr\0"
317 "fstat64\0"
318 "fstat\0"
319 "fstatat64\0"
320 "fstatfs64\0"
321 "fstatfs\0"
322 "ftruncate64\0"
323 "ftruncate\0"
324 "futimesat\0"
325 "getcwd\0"
326 "getdents64\0"
327 "getdents\0"
328 "getxattr\0"
329 "inotify_add_watch\0"
330 "inotify_init1\0"
331 "inotify_rm_watch\0"
332 "lgetxattr\0"
333 "link\0"
334 "linkat\0"
335 "listxattr\0"
336 "llistxattr\0"
337 "lremovexattr\0"
338 "lsetxattr\0"
339 "lstat64\0"
340 "lstat\0"
341 "mkdir\0"
342 "mkdirat\0"
343 "mknod\0"
344 "mknodat\0"
345 "mmap2\0"
346 "mmap\0"
347 "newfstatat\0"
348 "open\0"
349 "openat\0"
350 "readlink\0"
351 "readlinkat\0"
352 "removexattr\0"
353 "rename\0"
354 "renameat2\0"
355 "renameat\0"
356 "rmdir\0"
357 "setxattr\0"
358 "stat64\0"
359 "stat\0"
360 "statfs\0"
361 "symlink\0"
362 "symlinkat\0"
363 "truncate64\0"
364 "truncate\0"
365 "unlink\0"
366 "unlinkat\0"
367 "utimensat\0"
368 "utimes\0"
369 },
370 [SYSCALL_FILTER_SET_IO_EVENT] = {
371 .name = "@io-event",
372 .help = "Event loop system calls",
373 .value =
374 "_newselect\0"
375 "epoll_create1\0"
376 "epoll_create\0"
377 "epoll_ctl\0"
378 "epoll_ctl_old\0"
379 "epoll_pwait\0"
380 "epoll_wait\0"
381 "epoll_wait_old\0"
382 "eventfd2\0"
383 "eventfd\0"
384 "poll\0"
385 "ppoll\0"
386 "pselect6\0"
387 "select\0"
388 },
389 [SYSCALL_FILTER_SET_IPC] = {
390 .name = "@ipc",
391 .help = "SysV IPC, POSIX Message Queues or other IPC",
392 .value =
393 "ipc\0"
394 "memfd_create\0"
395 "mq_getsetattr\0"
396 "mq_notify\0"
397 "mq_open\0"
398 "mq_timedreceive\0"
399 "mq_timedsend\0"
400 "mq_unlink\0"
401 "msgctl\0"
402 "msgget\0"
403 "msgrcv\0"
404 "msgsnd\0"
405 "pipe2\0"
406 "pipe\0"
407 "process_vm_readv\0"
408 "process_vm_writev\0"
409 "semctl\0"
410 "semget\0"
411 "semop\0"
412 "semtimedop\0"
413 "shmat\0"
414 "shmctl\0"
415 "shmdt\0"
416 "shmget\0"
417 },
418 [SYSCALL_FILTER_SET_KEYRING] = {
419 .name = "@keyring",
420 .help = "Kernel keyring access",
421 .value =
422 "add_key\0"
423 "keyctl\0"
424 "request_key\0"
425 },
426 [SYSCALL_FILTER_SET_MODULE] = {
427 .name = "@module",
428 .help = "Loading and unloading of kernel modules",
429 .value =
430 "delete_module\0"
431 "finit_module\0"
432 "init_module\0"
433 },
434 [SYSCALL_FILTER_SET_MOUNT] = {
435 .name = "@mount",
436 .help = "Mounting and unmounting of file systems",
437 .value =
438 "chroot\0"
439 "mount\0"
440 "pivot_root\0"
441 "umount2\0"
442 "umount\0"
443 },
444 [SYSCALL_FILTER_SET_NETWORK_IO] = {
445 .name = "@network-io",
446 .help = "Network or Unix socket IO, should not be needed if not network facing",
447 .value =
448 "accept4\0"
449 "accept\0"
450 "bind\0"
451 "connect\0"
452 "getpeername\0"
453 "getsockname\0"
454 "getsockopt\0"
455 "listen\0"
456 "recv\0"
457 "recvfrom\0"
458 "recvmmsg\0"
459 "recvmsg\0"
460 "send\0"
461 "sendmmsg\0"
462 "sendmsg\0"
463 "sendto\0"
464 "setsockopt\0"
465 "shutdown\0"
466 "socket\0"
467 "socketcall\0"
468 "socketpair\0"
469 },
470 [SYSCALL_FILTER_SET_OBSOLETE] = {
471 /* some unknown even to libseccomp */
472 .name = "@obsolete",
473 .help = "Unusual, obsolete or unimplemented system calls",
474 .value =
475 "_sysctl\0"
476 "afs_syscall\0"
477 "bdflush\0"
478 "break\0"
479 "create_module\0"
480 "ftime\0"
481 "get_kernel_syms\0"
482 "getpmsg\0"
483 "gtty\0"
484 "lock\0"
485 "mpx\0"
486 "prof\0"
487 "profil\0"
488 "putpmsg\0"
489 "query_module\0"
490 "security\0"
491 "sgetmask\0"
492 "ssetmask\0"
493 "stty\0"
494 "sysfs\0"
495 "tuxcall\0"
496 "ulimit\0"
497 "uselib\0"
498 "ustat\0"
499 "vserver\0"
500 },
501 [SYSCALL_FILTER_SET_PRIVILEGED] = {
502 .name = "@privileged",
503 .help = "All system calls which need super-user capabilities",
504 .value =
505 "@clock\0"
506 "@module\0"
507 "@raw-io\0"
508 "acct\0"
509 "bpf\0"
510 "capset\0"
511 "chown32\0"
512 "chown\0"
513 "chroot\0"
514 "fchown32\0"
515 "fchown\0"
516 "fchownat\0"
517 "kexec_file_load\0"
518 "kexec_load\0"
519 "lchown32\0"
520 "lchown\0"
521 "nfsservctl\0"
522 "pivot_root\0"
523 "quotactl\0"
524 "reboot\0"
525 "setdomainname\0"
526 "setfsuid32\0"
527 "setfsuid\0"
528 "setgroups32\0"
529 "setgroups\0"
530 "sethostname\0"
531 "setresuid32\0"
532 "setresuid\0"
533 "setreuid32\0"
534 "setreuid\0"
535 "setuid32\0"
536 "setuid\0"
537 "swapoff\0"
538 "swapon\0"
539 "_sysctl\0"
540 "vhangup\0"
541 },
542 [SYSCALL_FILTER_SET_PROCESS] = {
543 .name = "@process",
544 .help = "Process control, execution, namespaceing operations",
545 .value =
546 "arch_prctl\0"
547 "clone\0"
548 "execveat\0"
549 "fork\0"
550 "kill\0"
551 "prctl\0"
552 "setns\0"
553 "tgkill\0"
554 "tkill\0"
555 "unshare\0"
556 "vfork\0"
557 },
558 [SYSCALL_FILTER_SET_RAW_IO] = {
559 .name = "@raw-io",
560 .help = "Raw I/O port access",
561 .value =
562 "ioperm\0"
563 "iopl\0"
564 "pciconfig_iobase\0"
565 "pciconfig_read\0"
566 "pciconfig_write\0"
567 #ifdef __NR_s390_pci_mmio_read
568 "s390_pci_mmio_read\0"
569 #endif
570 #ifdef __NR_s390_pci_mmio_write
571 "s390_pci_mmio_write\0"
572 #endif
573 },
574 [SYSCALL_FILTER_SET_REBOOT] = {
575 .name = "@reboot",
576 .help = "Reboot and reboot preparation/kexec",
577 .value =
578 "kexec\0"
579 "kexec_file_load\0"
580 "reboot\0"
581 },
582 [SYSCALL_FILTER_SET_RESOURCES] = {
583 .name = "@resources",
584 .help = "Alter resource settings",
585 .value =
586 "sched_setparam\0"
587 "sched_setscheduler\0"
588 "sched_setaffinity\0"
589 "setpriority\0"
590 "setrlimit\0"
591 "set_mempolicy\0"
592 "migrate_pages\0"
593 "move_pages\0"
594 "mbind\0"
595 "sched_setattr\0"
596 "prlimit64\0"
597 },
598 [SYSCALL_FILTER_SET_SWAP] = {
599 .name = "@swap",
600 .help = "Enable/disable swap devices",
601 .value =
602 "swapoff\0"
603 "swapon\0"
604 },
605 };
606
607 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
608 unsigned i;
609
610 if (isempty(name) || name[0] != '@')
611 return NULL;
612
613 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
614 if (streq(syscall_filter_sets[i].name, name))
615 return syscall_filter_sets + i;
616
617 return NULL;
618 }
619
620 static int seccomp_add_syscall_filter_set(
621 scmp_filter_ctx seccomp,
622 uint32_t default_action,
623 const SyscallFilterSet *set,
624 uint32_t action) {
625
626 const char *sys;
627 int r;
628
629 assert(seccomp);
630 assert(set);
631
632 NULSTR_FOREACH(sys, set->value) {
633 int id;
634
635 if (sys[0] == '@') {
636 const SyscallFilterSet *other;
637
638 other = syscall_filter_set_find(sys);
639 if (!other)
640 return -EINVAL;
641
642 r = seccomp_add_syscall_filter_set(seccomp, default_action, other, action);
643 if (r < 0)
644 return r;
645 } else {
646 id = seccomp_syscall_resolve_name(sys);
647 if (id == __NR_SCMP_ERROR)
648 return -EINVAL; /* Not known at all? Then that's a real error */
649
650 r = seccomp_rule_add_exact(seccomp, action, id, 0);
651 if (r < 0)
652 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
653 log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", sys);
654 }
655 }
656
657 return 0;
658 }
659
660 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
661 uint32_t arch;
662 int r;
663
664 assert(set);
665
666 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
667 * earch local arch. */
668
669 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
670 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
671
672 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
673
674 r = seccomp_init_for_arch(&seccomp, arch, default_action);
675 if (r < 0)
676 return r;
677
678 r = seccomp_add_syscall_filter_set(seccomp, default_action, set, action);
679 if (r < 0) {
680 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
681 continue;
682 }
683
684 r = seccomp_load(seccomp);
685 if (IN_SET(r, -EPERM, -EACCES))
686 return r;
687 if (r < 0)
688 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
689 }
690
691 return 0;
692 }
693
694 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
695 uint32_t arch;
696 int r;
697
698 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
699 * SyscallFilterSet* table. */
700
701 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
702 return 0;
703
704 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
705 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
706 Iterator i;
707 void *id;
708
709 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
710
711 r = seccomp_init_for_arch(&seccomp, arch, default_action);
712 if (r < 0)
713 return r;
714
715 SET_FOREACH(id, set, i) {
716 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
717 if (r < 0) {
718 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
719 _cleanup_free_ char *n = NULL;
720
721 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
722 log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", strna(n));
723 }
724 }
725
726 r = seccomp_load(seccomp);
727 if (IN_SET(r, -EPERM, -EACCES))
728 return r;
729 if (r < 0)
730 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
731 }
732
733 return 0;
734 }
735
736 int seccomp_restrict_namespaces(unsigned long retain) {
737 uint32_t arch;
738 int r;
739
740 if (log_get_max_level() >= LOG_DEBUG) {
741 _cleanup_free_ char *s = NULL;
742
743 (void) namespace_flag_to_string_many(retain, &s);
744 log_debug("Restricting namespace to: %s.", strna(s));
745 }
746
747 /* NOOP? */
748 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
749 return 0;
750
751 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
752 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
753 unsigned i;
754
755 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
756
757 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
758 if (r < 0)
759 return r;
760
761 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
762 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
763 * altogether. */
764 r = seccomp_rule_add_exact(
765 seccomp,
766 SCMP_ACT_ERRNO(EPERM),
767 SCMP_SYS(setns),
768 0);
769 else
770 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
771 * special invocation with a zero flags argument, right here. */
772 r = seccomp_rule_add_exact(
773 seccomp,
774 SCMP_ACT_ERRNO(EPERM),
775 SCMP_SYS(setns),
776 1,
777 SCMP_A1(SCMP_CMP_EQ, 0));
778 if (r < 0) {
779 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
780 continue;
781 }
782
783 for (i = 0; namespace_flag_map[i].name; i++) {
784 unsigned long f;
785
786 f = namespace_flag_map[i].flag;
787 if ((retain & f) == f) {
788 log_debug("Permitting %s.", namespace_flag_map[i].name);
789 continue;
790 }
791
792 log_debug("Blocking %s.", namespace_flag_map[i].name);
793
794 r = seccomp_rule_add_exact(
795 seccomp,
796 SCMP_ACT_ERRNO(EPERM),
797 SCMP_SYS(unshare),
798 1,
799 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
800 if (r < 0) {
801 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
802 break;
803 }
804
805 r = seccomp_rule_add_exact(
806 seccomp,
807 SCMP_ACT_ERRNO(EPERM),
808 SCMP_SYS(clone),
809 1,
810 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
811 if (r < 0) {
812 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
813 break;
814 }
815
816 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
817 r = seccomp_rule_add_exact(
818 seccomp,
819 SCMP_ACT_ERRNO(EPERM),
820 SCMP_SYS(setns),
821 1,
822 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
823 if (r < 0) {
824 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
825 break;
826 }
827 }
828 }
829 if (r < 0)
830 continue;
831
832 r = seccomp_load(seccomp);
833 if (IN_SET(r, -EPERM, -EACCES))
834 return r;
835 if (r < 0)
836 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
837 }
838
839 return 0;
840 }
841
842 int seccomp_protect_sysctl(void) {
843 uint32_t arch;
844 int r;
845
846 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
847 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
848
849 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
850
851 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
852 if (r < 0)
853 return r;
854
855 r = seccomp_rule_add_exact(
856 seccomp,
857 SCMP_ACT_ERRNO(EPERM),
858 SCMP_SYS(_sysctl),
859 0);
860 if (r < 0) {
861 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
862 continue;
863 }
864
865 r = seccomp_load(seccomp);
866 if (IN_SET(r, -EPERM, -EACCES))
867 return r;
868 if (r < 0)
869 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
870 }
871
872 return 0;
873 }
874
875 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
876 uint32_t arch;
877 int r;
878
879 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
880 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
881 Iterator i;
882
883 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
884
885 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
886 if (r < 0)
887 return r;
888
889 if (whitelist) {
890 int af, first = 0, last = 0;
891 void *afp;
892
893 /* If this is a whitelist, we first block the address families that are out of range and then
894 * everything that is not in the set. First, we find the lowest and highest address family in
895 * the set. */
896
897 SET_FOREACH(afp, address_families, i) {
898 af = PTR_TO_INT(afp);
899
900 if (af <= 0 || af >= af_max())
901 continue;
902
903 if (first == 0 || af < first)
904 first = af;
905
906 if (last == 0 || af > last)
907 last = af;
908 }
909
910 assert((first == 0) == (last == 0));
911
912 if (first == 0) {
913
914 /* No entries in the valid range, block everything */
915 r = seccomp_rule_add_exact(
916 seccomp,
917 SCMP_ACT_ERRNO(EAFNOSUPPORT),
918 SCMP_SYS(socket),
919 0);
920 if (r < 0) {
921 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
922 continue;
923 }
924
925 } else {
926
927 /* Block everything below the first entry */
928 r = seccomp_rule_add_exact(
929 seccomp,
930 SCMP_ACT_ERRNO(EAFNOSUPPORT),
931 SCMP_SYS(socket),
932 1,
933 SCMP_A0(SCMP_CMP_LT, first));
934 if (r < 0) {
935 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
936 continue;
937 }
938
939 /* Block everything above the last entry */
940 r = seccomp_rule_add_exact(
941 seccomp,
942 SCMP_ACT_ERRNO(EAFNOSUPPORT),
943 SCMP_SYS(socket),
944 1,
945 SCMP_A0(SCMP_CMP_GT, last));
946 if (r < 0) {
947 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
948 continue;
949 }
950
951 /* Block everything between the first and last entry */
952 for (af = 1; af < af_max(); af++) {
953
954 if (set_contains(address_families, INT_TO_PTR(af)))
955 continue;
956
957 r = seccomp_rule_add_exact(
958 seccomp,
959 SCMP_ACT_ERRNO(EAFNOSUPPORT),
960 SCMP_SYS(socket),
961 1,
962 SCMP_A0(SCMP_CMP_EQ, af));
963 if (r < 0)
964 break;
965 }
966
967 if (r < 0) {
968 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
969 continue;
970 }
971 }
972
973 } else {
974 void *af;
975
976 /* If this is a blacklist, then generate one rule for
977 * each address family that are then combined in OR
978 * checks. */
979
980 SET_FOREACH(af, address_families, i) {
981
982 r = seccomp_rule_add_exact(
983 seccomp,
984 SCMP_ACT_ERRNO(EAFNOSUPPORT),
985 SCMP_SYS(socket),
986 1,
987 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
988 if (r < 0)
989 break;
990 }
991
992 if (r < 0) {
993 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
994 continue;
995 }
996 }
997
998 r = seccomp_load(seccomp);
999 if (IN_SET(r, -EPERM, -EACCES))
1000 return r;
1001 if (r < 0)
1002 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1003 }
1004
1005 return 0;
1006 }
1007
1008 int seccomp_restrict_realtime(void) {
1009 static const int permitted_policies[] = {
1010 SCHED_OTHER,
1011 SCHED_BATCH,
1012 SCHED_IDLE,
1013 };
1014
1015 int r, max_policy = 0;
1016 uint32_t arch;
1017 unsigned i;
1018
1019 /* Determine the highest policy constant we want to allow */
1020 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1021 if (permitted_policies[i] > max_policy)
1022 max_policy = permitted_policies[i];
1023
1024 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1025 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1026 int p;
1027
1028 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1029
1030 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1031 if (r < 0)
1032 return r;
1033
1034 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1035 * whitelist. */
1036 for (p = 0; p < max_policy; p++) {
1037 bool good = false;
1038
1039 /* Check if this is in the whitelist. */
1040 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1041 if (permitted_policies[i] == p) {
1042 good = true;
1043 break;
1044 }
1045
1046 if (good)
1047 continue;
1048
1049 /* Deny this policy */
1050 r = seccomp_rule_add_exact(
1051 seccomp,
1052 SCMP_ACT_ERRNO(EPERM),
1053 SCMP_SYS(sched_setscheduler),
1054 1,
1055 SCMP_A1(SCMP_CMP_EQ, p));
1056 if (r < 0) {
1057 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 continue;
1059 }
1060 }
1061
1062 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1063 * unsigned here, hence no need no check for < 0 values. */
1064 r = seccomp_rule_add_exact(
1065 seccomp,
1066 SCMP_ACT_ERRNO(EPERM),
1067 SCMP_SYS(sched_setscheduler),
1068 1,
1069 SCMP_A1(SCMP_CMP_GT, max_policy));
1070 if (r < 0) {
1071 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1072 continue;
1073 }
1074
1075 r = seccomp_load(seccomp);
1076 if (IN_SET(r, -EPERM, -EACCES))
1077 return r;
1078 if (r < 0)
1079 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1080 }
1081
1082 return 0;
1083 }
1084
1085 int seccomp_memory_deny_write_execute(void) {
1086 uint32_t arch;
1087 int r;
1088
1089 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1090 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1091
1092 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1093
1094 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1095 if (r < 0)
1096 return r;
1097
1098 r = seccomp_rule_add_exact(
1099 seccomp,
1100 SCMP_ACT_ERRNO(EPERM),
1101 SCMP_SYS(mmap),
1102 1,
1103 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1104 if (r < 0) {
1105 log_debug_errno(r, "Failed to add mmap() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1106 continue;
1107 }
1108
1109 r = seccomp_rule_add_exact(
1110 seccomp,
1111 SCMP_ACT_ERRNO(EPERM),
1112 SCMP_SYS(mprotect),
1113 1,
1114 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1115 if (r < 0) {
1116 log_debug_errno(r, "Failed to add mprotect() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1117 continue;
1118 }
1119
1120 r = seccomp_rule_add_exact(
1121 seccomp,
1122 SCMP_ACT_ERRNO(EPERM),
1123 SCMP_SYS(shmat),
1124 1,
1125 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1126 if (r < 0) {
1127 log_debug_errno(r, "Failed to add shmat() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1128 continue;
1129 }
1130
1131 r = seccomp_load(seccomp);
1132 if (IN_SET(r, -EPERM, -EACCES))
1133 return r;
1134 if (r < 0)
1135 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1136 }
1137
1138 return 0;
1139 }
1140
1141 int seccomp_restrict_archs(Set *archs) {
1142 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1143 Iterator i;
1144 void *id;
1145 int r;
1146
1147 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1148 * list. */
1149
1150 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1151 if (!seccomp)
1152 return -ENOMEM;
1153
1154 SET_FOREACH(id, archs, i) {
1155 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1156 if (r == -EEXIST)
1157 continue;
1158 if (r < 0)
1159 return r;
1160 }
1161
1162 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1163 if (r < 0)
1164 return r;
1165
1166 return seccomp_load(seccomp);
1167 }