]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #6428 from boucman/device_reload
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <linux/seccomp.h>
22 #include <seccomp.h>
23 #include <stddef.h>
24 #include <sys/mman.h>
25 #include <sys/prctl.h>
26 #include <sys/shm.h>
27
28 #include "af-list.h"
29 #include "alloc-util.h"
30 #include "macro.h"
31 #include "nsflags.h"
32 #include "process-util.h"
33 #include "seccomp-util.h"
34 #include "set.h"
35 #include "string-util.h"
36 #include "strv.h"
37 #include "util.h"
38 #include "errno-list.h"
39
/* Table of the seccomp architectures that filters are generated for on this build. The set is chosen at compile
 * time from the target's predefined macros and always ends with a (uint32_t) -1 entry, which presumably serves as
 * the end marker for SECCOMP_FOREACH_LOCAL_ARCH() -- confirm against seccomp-util.h. If none of the #if branches
 * matches, the table consists of the end marker only. */
const uint32_t seccomp_local_archs[] = {

        /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */

#if defined(__x86_64__) && defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,         /* native */
#elif defined(__x86_64__) && !defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X32,
                SCMP_ARCH_X86_64,      /* native */
#elif defined(__i386__)
                SCMP_ARCH_X86,
#elif defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,     /* native */
#elif defined(__arm__)
                SCMP_ARCH_ARM,
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,        /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,    /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,   /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32, /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64LE,
                SCMP_ARCH_PPC64,       /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,     /* native */
#elif defined(__powerpc__)
                SCMP_ARCH_PPC,
#elif defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,       /* native */
#elif defined(__s390__)
                SCMP_ARCH_S390,
#endif
                /* End-of-list marker */
                (uint32_t) -1
        };
111
112 const char* seccomp_arch_to_string(uint32_t c) {
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
117
118 switch(c) {
119 case SCMP_ARCH_NATIVE:
120 return "native";
121 case SCMP_ARCH_X86:
122 return "x86";
123 case SCMP_ARCH_X86_64:
124 return "x86-64";
125 case SCMP_ARCH_X32:
126 return "x32";
127 case SCMP_ARCH_ARM:
128 return "arm";
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 case SCMP_ARCH_MIPS:
132 return "mips";
133 case SCMP_ARCH_MIPS64:
134 return "mips64";
135 case SCMP_ARCH_MIPS64N32:
136 return "mips64-n32";
137 case SCMP_ARCH_MIPSEL:
138 return "mips-le";
139 case SCMP_ARCH_MIPSEL64:
140 return "mips64-le";
141 case SCMP_ARCH_MIPSEL64N32:
142 return "mips64-le-n32";
143 case SCMP_ARCH_PPC:
144 return "ppc";
145 case SCMP_ARCH_PPC64:
146 return "ppc64";
147 case SCMP_ARCH_PPC64LE:
148 return "ppc64-le";
149 case SCMP_ARCH_S390:
150 return "s390";
151 case SCMP_ARCH_S390X:
152 return "s390x";
153 default:
154 return NULL;
155 }
156 }
157
158 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
159 if (!n)
160 return -EINVAL;
161
162 assert(ret);
163
164 if (streq(n, "native"))
165 *ret = SCMP_ARCH_NATIVE;
166 else if (streq(n, "x86"))
167 *ret = SCMP_ARCH_X86;
168 else if (streq(n, "x86-64"))
169 *ret = SCMP_ARCH_X86_64;
170 else if (streq(n, "x32"))
171 *ret = SCMP_ARCH_X32;
172 else if (streq(n, "arm"))
173 *ret = SCMP_ARCH_ARM;
174 else if (streq(n, "arm64"))
175 *ret = SCMP_ARCH_AARCH64;
176 else if (streq(n, "mips"))
177 *ret = SCMP_ARCH_MIPS;
178 else if (streq(n, "mips64"))
179 *ret = SCMP_ARCH_MIPS64;
180 else if (streq(n, "mips64-n32"))
181 *ret = SCMP_ARCH_MIPS64N32;
182 else if (streq(n, "mips-le"))
183 *ret = SCMP_ARCH_MIPSEL;
184 else if (streq(n, "mips64-le"))
185 *ret = SCMP_ARCH_MIPSEL64;
186 else if (streq(n, "mips64-le-n32"))
187 *ret = SCMP_ARCH_MIPSEL64N32;
188 else if (streq(n, "ppc"))
189 *ret = SCMP_ARCH_PPC;
190 else if (streq(n, "ppc64"))
191 *ret = SCMP_ARCH_PPC64;
192 else if (streq(n, "ppc64-le"))
193 *ret = SCMP_ARCH_PPC64LE;
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
198 else
199 return -EINVAL;
200
201 return 0;
202 }
203
204 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
205 scmp_filter_ctx seccomp;
206 int r;
207
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
219 if (r < 0)
220 goto finish;
221
222 r = seccomp_arch_add(seccomp, arch);
223 if (r < 0)
224 goto finish;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
235 if (r < 0)
236 goto finish;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 goto finish;
241
242 *ret = seccomp;
243 return 0;
244
245 finish:
246 seccomp_release(seccomp);
247 return r;
248 }
249
/* Reports whether querying the current seccomp mode via prctl(PR_GET_SECCOMP) succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
253
/* Probes for seccomp filter mode by asking the kernel to install a NULL filter program. The request is
 * expected to fail; filter mode is considered available only if the failure is EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns true if both the basic seccomp query and the filter-mode probe succeed. The result is computed on
 * the first call and kept in a static variable for all later calls. */
bool is_seccomp_available(void) {
        static int cached = -1; /* -1: not determined yet, otherwise the boolean result */

        if (cached >= 0)
                return cached;

        cached = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached;
}
269
270 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
271 [SYSCALL_FILTER_SET_DEFAULT] = {
272 .name = "@default",
273 .help = "System calls that are always permitted",
274 .value =
275 "clock_getres\0"
276 "clock_gettime\0"
277 "clock_nanosleep\0"
278 "execve\0"
279 "exit\0"
280 "exit_group\0"
281 "getrlimit\0" /* make sure processes can query stack size and such */
282 "gettimeofday\0"
283 "nanosleep\0"
284 "pause\0"
285 "rt_sigreturn\0"
286 "sigreturn\0"
287 "time\0"
288 },
289 [SYSCALL_FILTER_SET_BASIC_IO] = {
290 .name = "@basic-io",
291 .help = "Basic IO",
292 .value =
293 "close\0"
294 "dup2\0"
295 "dup3\0"
296 "dup\0"
297 "lseek\0"
298 "pread64\0"
299 "preadv\0"
300 "pwrite64\0"
301 "pwritev\0"
302 "read\0"
303 "readv\0"
304 "write\0"
305 "writev\0"
306 },
307 [SYSCALL_FILTER_SET_CLOCK] = {
308 .name = "@clock",
309 .help = "Change the system time",
310 .value =
311 "adjtimex\0"
312 "clock_adjtime\0"
313 "clock_settime\0"
314 "settimeofday\0"
315 "stime\0"
316 },
317 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
318 .name = "@cpu-emulation",
319 .help = "System calls for CPU emulation functionality",
320 .value =
321 "modify_ldt\0"
322 "subpage_prot\0"
323 "switch_endian\0"
324 "vm86\0"
325 "vm86old\0"
326 },
327 [SYSCALL_FILTER_SET_DEBUG] = {
328 .name = "@debug",
329 .help = "Debugging, performance monitoring and tracing functionality",
330 .value =
331 "lookup_dcookie\0"
332 "perf_event_open\0"
333 "process_vm_readv\0"
334 "process_vm_writev\0"
335 "ptrace\0"
336 "rtas\0"
337 #ifdef __NR_s390_runtime_instr
338 "s390_runtime_instr\0"
339 #endif
340 "sys_debug_setcontext\0"
341 },
342 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
343 .name = "@file-system",
344 .help = "File system operations",
345 .value =
346 "access\0"
347 "chdir\0"
348 "chmod\0"
349 "close\0"
350 "creat\0"
351 "faccessat\0"
352 "fallocate\0"
353 "fchdir\0"
354 "fchmod\0"
355 "fchmodat\0"
356 "fcntl64\0"
357 "fcntl\0"
358 "fgetxattr\0"
359 "flistxattr\0"
360 "fsetxattr\0"
361 "fstat64\0"
362 "fstat\0"
363 "fstatat64\0"
364 "fstatfs64\0"
365 "fstatfs\0"
366 "ftruncate64\0"
367 "ftruncate\0"
368 "futimesat\0"
369 "getcwd\0"
370 "getdents64\0"
371 "getdents\0"
372 "getxattr\0"
373 "inotify_add_watch\0"
374 "inotify_init1\0"
375 "inotify_rm_watch\0"
376 "lgetxattr\0"
377 "link\0"
378 "linkat\0"
379 "listxattr\0"
380 "llistxattr\0"
381 "lremovexattr\0"
382 "lsetxattr\0"
383 "lstat64\0"
384 "lstat\0"
385 "mkdir\0"
386 "mkdirat\0"
387 "mknod\0"
388 "mknodat\0"
389 "mmap2\0"
390 "mmap\0"
391 "munmap\0"
392 "newfstatat\0"
393 "open\0"
394 "openat\0"
395 "readlink\0"
396 "readlinkat\0"
397 "removexattr\0"
398 "rename\0"
399 "renameat2\0"
400 "renameat\0"
401 "rmdir\0"
402 "setxattr\0"
403 "stat64\0"
404 "stat\0"
405 "statfs\0"
406 "statx\0"
407 "symlink\0"
408 "symlinkat\0"
409 "truncate64\0"
410 "truncate\0"
411 "unlink\0"
412 "unlinkat\0"
413 "utimensat\0"
414 "utimes\0"
415 },
416 [SYSCALL_FILTER_SET_IO_EVENT] = {
417 .name = "@io-event",
418 .help = "Event loop system calls",
419 .value =
420 "_newselect\0"
421 "epoll_create1\0"
422 "epoll_create\0"
423 "epoll_ctl\0"
424 "epoll_ctl_old\0"
425 "epoll_pwait\0"
426 "epoll_wait\0"
427 "epoll_wait_old\0"
428 "eventfd2\0"
429 "eventfd\0"
430 "poll\0"
431 "ppoll\0"
432 "pselect6\0"
433 "select\0"
434 },
435 [SYSCALL_FILTER_SET_IPC] = {
436 .name = "@ipc",
437 .help = "SysV IPC, POSIX Message Queues or other IPC",
438 .value =
439 "ipc\0"
440 "memfd_create\0"
441 "mq_getsetattr\0"
442 "mq_notify\0"
443 "mq_open\0"
444 "mq_timedreceive\0"
445 "mq_timedsend\0"
446 "mq_unlink\0"
447 "msgctl\0"
448 "msgget\0"
449 "msgrcv\0"
450 "msgsnd\0"
451 "pipe2\0"
452 "pipe\0"
453 "process_vm_readv\0"
454 "process_vm_writev\0"
455 "semctl\0"
456 "semget\0"
457 "semop\0"
458 "semtimedop\0"
459 "shmat\0"
460 "shmctl\0"
461 "shmdt\0"
462 "shmget\0"
463 },
464 [SYSCALL_FILTER_SET_KEYRING] = {
465 .name = "@keyring",
466 .help = "Kernel keyring access",
467 .value =
468 "add_key\0"
469 "keyctl\0"
470 "request_key\0"
471 },
472 [SYSCALL_FILTER_SET_MODULE] = {
473 .name = "@module",
474 .help = "Loading and unloading of kernel modules",
475 .value =
476 "delete_module\0"
477 "finit_module\0"
478 "init_module\0"
479 },
480 [SYSCALL_FILTER_SET_MOUNT] = {
481 .name = "@mount",
482 .help = "Mounting and unmounting of file systems",
483 .value =
484 "chroot\0"
485 "mount\0"
486 "pivot_root\0"
487 "umount2\0"
488 "umount\0"
489 },
490 [SYSCALL_FILTER_SET_NETWORK_IO] = {
491 .name = "@network-io",
492 .help = "Network or Unix socket IO, should not be needed if not network facing",
493 .value =
494 "accept4\0"
495 "accept\0"
496 "bind\0"
497 "connect\0"
498 "getpeername\0"
499 "getsockname\0"
500 "getsockopt\0"
501 "listen\0"
502 "recv\0"
503 "recvfrom\0"
504 "recvmmsg\0"
505 "recvmsg\0"
506 "send\0"
507 "sendmmsg\0"
508 "sendmsg\0"
509 "sendto\0"
510 "setsockopt\0"
511 "shutdown\0"
512 "socket\0"
513 "socketcall\0"
514 "socketpair\0"
515 },
516 [SYSCALL_FILTER_SET_OBSOLETE] = {
517 /* some unknown even to libseccomp */
518 .name = "@obsolete",
519 .help = "Unusual, obsolete or unimplemented system calls",
520 .value =
521 "_sysctl\0"
522 "afs_syscall\0"
523 "bdflush\0"
524 "break\0"
525 "create_module\0"
526 "ftime\0"
527 "get_kernel_syms\0"
528 "getpmsg\0"
529 "gtty\0"
530 "lock\0"
531 "mpx\0"
532 "prof\0"
533 "profil\0"
534 "putpmsg\0"
535 "query_module\0"
536 "security\0"
537 "sgetmask\0"
538 "ssetmask\0"
539 "stty\0"
540 "sysfs\0"
541 "tuxcall\0"
542 "ulimit\0"
543 "uselib\0"
544 "ustat\0"
545 "vserver\0"
546 },
547 [SYSCALL_FILTER_SET_PRIVILEGED] = {
548 .name = "@privileged",
549 .help = "All system calls which need super-user capabilities",
550 .value =
551 "@clock\0"
552 "@module\0"
553 "@raw-io\0"
554 "acct\0"
555 "bpf\0"
556 "capset\0"
557 "chown32\0"
558 "chown\0"
559 "chroot\0"
560 "fchown32\0"
561 "fchown\0"
562 "fchownat\0"
563 "kexec_file_load\0"
564 "kexec_load\0"
565 "lchown32\0"
566 "lchown\0"
567 "nfsservctl\0"
568 "pivot_root\0"
569 "quotactl\0"
570 "reboot\0"
571 "setdomainname\0"
572 "setfsuid32\0"
573 "setfsuid\0"
574 "setgroups32\0"
575 "setgroups\0"
576 "sethostname\0"
577 "setresuid32\0"
578 "setresuid\0"
579 "setreuid32\0"
580 "setreuid\0"
581 "setuid32\0"
582 "setuid\0"
583 "swapoff\0"
584 "swapon\0"
585 "_sysctl\0"
586 "vhangup\0"
587 },
588 [SYSCALL_FILTER_SET_PROCESS] = {
589 .name = "@process",
590 .help = "Process control, execution, namespaceing operations",
591 .value =
592 "arch_prctl\0"
593 "clone\0"
594 "execveat\0"
595 "fork\0"
596 "kill\0"
597 "prctl\0"
598 "setns\0"
599 "tgkill\0"
600 "tkill\0"
601 "unshare\0"
602 "vfork\0"
603 },
604 [SYSCALL_FILTER_SET_RAW_IO] = {
605 .name = "@raw-io",
606 .help = "Raw I/O port access",
607 .value =
608 "ioperm\0"
609 "iopl\0"
610 "pciconfig_iobase\0"
611 "pciconfig_read\0"
612 "pciconfig_write\0"
613 #ifdef __NR_s390_pci_mmio_read
614 "s390_pci_mmio_read\0"
615 #endif
616 #ifdef __NR_s390_pci_mmio_write
617 "s390_pci_mmio_write\0"
618 #endif
619 },
620 [SYSCALL_FILTER_SET_REBOOT] = {
621 .name = "@reboot",
622 .help = "Reboot and reboot preparation/kexec",
623 .value =
624 "kexec\0"
625 "kexec_file_load\0"
626 "reboot\0"
627 },
628 [SYSCALL_FILTER_SET_RESOURCES] = {
629 .name = "@resources",
630 .help = "Alter resource settings",
631 .value =
632 "sched_setparam\0"
633 "sched_setscheduler\0"
634 "sched_setaffinity\0"
635 "setpriority\0"
636 "setrlimit\0"
637 "set_mempolicy\0"
638 "migrate_pages\0"
639 "move_pages\0"
640 "mbind\0"
641 "sched_setattr\0"
642 "prlimit64\0"
643 },
644 [SYSCALL_FILTER_SET_SETUID] = {
645 .name = "@setuid",
646 .help = "Operations for changing user/group credentials",
647 .value =
648 "setgid32\0"
649 "setgid\0"
650 "setgroups32\0"
651 "setgroups\0"
652 "setregid32\0"
653 "setregid\0"
654 "setresgid32\0"
655 "setresgid\0"
656 "setresuid32\0"
657 "setresuid\0"
658 "setreuid32\0"
659 "setreuid\0"
660 "setuid32\0"
661 "setuid\0"
662 },
663 [SYSCALL_FILTER_SET_SWAP] = {
664 .name = "@swap",
665 .help = "Enable/disable swap devices",
666 .value =
667 "swapoff\0"
668 "swapon\0"
669 },
670 };
671
672 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
673 unsigned i;
674
675 if (isempty(name) || name[0] != '@')
676 return NULL;
677
678 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
679 if (streq(syscall_filter_sets[i].name, name))
680 return syscall_filter_sets + i;
681
682 return NULL;
683 }
684
685 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
686
687 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
688 int r;
689
690 assert(seccomp);
691 assert(name);
692
693 if (strv_contains(exclude, name))
694 return 0;
695
696 if (name[0] == '@') {
697 const SyscallFilterSet *other;
698
699 other = syscall_filter_set_find(name);
700 if (!other)
701 return -EINVAL;
702
703 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
704 if (r < 0)
705 return r;
706 } else {
707 int id;
708
709 id = seccomp_syscall_resolve_name(name);
710 if (id == __NR_SCMP_ERROR)
711 return -EINVAL; /* Not known at all? Then that's a real error */
712
713 r = seccomp_rule_add_exact(seccomp, action, id, 0);
714 if (r < 0)
715 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
716 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
717 }
718
719 return 0;
720 }
721
722 static int seccomp_add_syscall_filter_set(
723 scmp_filter_ctx seccomp,
724 const SyscallFilterSet *set,
725 uint32_t action,
726 char **exclude) {
727
728 const char *sys;
729 int r;
730
731 assert(seccomp);
732 assert(set);
733
734 NULSTR_FOREACH(sys, set->value) {
735 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
736 if (r < 0)
737 return r;
738 }
739
740 return 0;
741 }
742
743 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
744 uint32_t arch;
745 int r;
746
747 assert(set);
748
749 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
750 * earch local arch. */
751
752 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
753 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
754
755 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
756
757 r = seccomp_init_for_arch(&seccomp, arch, default_action);
758 if (r < 0)
759 return r;
760
761 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
762 if (r < 0) {
763 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
764 continue;
765 }
766
767 r = seccomp_load(seccomp);
768 if (IN_SET(r, -EPERM, -EACCES))
769 return r;
770 if (r < 0)
771 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
772 }
773
774 return 0;
775 }
776
777 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
778 uint32_t arch;
779 int r;
780
781 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
782 * SyscallFilterSet* table. */
783
784 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
785 return 0;
786
787 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
788 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
789 Iterator i;
790 void *id;
791
792 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
793
794 r = seccomp_init_for_arch(&seccomp, arch, default_action);
795 if (r < 0)
796 return r;
797
798 SET_FOREACH(id, set, i) {
799 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
800 if (r < 0) {
801 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
802 _cleanup_free_ char *n = NULL;
803
804 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
805 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
806 }
807 }
808
809 r = seccomp_load(seccomp);
810 if (IN_SET(r, -EPERM, -EACCES))
811 return r;
812 if (r < 0)
813 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
814 }
815
816 return 0;
817 }
818
819 int seccomp_restrict_namespaces(unsigned long retain) {
820 uint32_t arch;
821 int r;
822
823 if (log_get_max_level() >= LOG_DEBUG) {
824 _cleanup_free_ char *s = NULL;
825
826 (void) namespace_flag_to_string_many(retain, &s);
827 log_debug("Restricting namespace to: %s.", strna(s));
828 }
829
830 /* NOOP? */
831 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
832 return 0;
833
834 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
835 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
836 unsigned i;
837
838 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
839
840 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
841 if (r < 0)
842 return r;
843
844 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
845 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
846 * altogether. */
847 r = seccomp_rule_add_exact(
848 seccomp,
849 SCMP_ACT_ERRNO(EPERM),
850 SCMP_SYS(setns),
851 0);
852 else
853 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
854 * special invocation with a zero flags argument, right here. */
855 r = seccomp_rule_add_exact(
856 seccomp,
857 SCMP_ACT_ERRNO(EPERM),
858 SCMP_SYS(setns),
859 1,
860 SCMP_A1(SCMP_CMP_EQ, 0));
861 if (r < 0) {
862 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
863 continue;
864 }
865
866 for (i = 0; namespace_flag_map[i].name; i++) {
867 unsigned long f;
868
869 f = namespace_flag_map[i].flag;
870 if ((retain & f) == f) {
871 log_debug("Permitting %s.", namespace_flag_map[i].name);
872 continue;
873 }
874
875 log_debug("Blocking %s.", namespace_flag_map[i].name);
876
877 r = seccomp_rule_add_exact(
878 seccomp,
879 SCMP_ACT_ERRNO(EPERM),
880 SCMP_SYS(unshare),
881 1,
882 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
883 if (r < 0) {
884 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
885 break;
886 }
887
888 /* On s390/s390x the first two parameters to clone are switched */
889 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
890 r = seccomp_rule_add_exact(
891 seccomp,
892 SCMP_ACT_ERRNO(EPERM),
893 SCMP_SYS(clone),
894 1,
895 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
896 else
897 r = seccomp_rule_add_exact(
898 seccomp,
899 SCMP_ACT_ERRNO(EPERM),
900 SCMP_SYS(clone),
901 1,
902 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
903 if (r < 0) {
904 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
905 break;
906 }
907
908 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
909 r = seccomp_rule_add_exact(
910 seccomp,
911 SCMP_ACT_ERRNO(EPERM),
912 SCMP_SYS(setns),
913 1,
914 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
915 if (r < 0) {
916 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
917 break;
918 }
919 }
920 }
921 if (r < 0)
922 continue;
923
924 r = seccomp_load(seccomp);
925 if (IN_SET(r, -EPERM, -EACCES))
926 return r;
927 if (r < 0)
928 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
929 }
930
931 return 0;
932 }
933
934 int seccomp_protect_sysctl(void) {
935 uint32_t arch;
936 int r;
937
938 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
939 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
940
941 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
942
943 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
944 /* No _sysctl syscall */
945 continue;
946
947 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
948 if (r < 0)
949 return r;
950
951 r = seccomp_rule_add_exact(
952 seccomp,
953 SCMP_ACT_ERRNO(EPERM),
954 SCMP_SYS(_sysctl),
955 0);
956 if (r < 0) {
957 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
958 continue;
959 }
960
961 r = seccomp_load(seccomp);
962 if (IN_SET(r, -EPERM, -EACCES))
963 return r;
964 if (r < 0)
965 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
966 }
967
968 return 0;
969 }
970
971 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
972 uint32_t arch;
973 int r;
974
975 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
976 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
977 bool supported;
978 Iterator i;
979
980 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
981
982 switch (arch) {
983
984 case SCMP_ARCH_X86_64:
985 case SCMP_ARCH_X32:
986 case SCMP_ARCH_ARM:
987 case SCMP_ARCH_AARCH64:
988 case SCMP_ARCH_PPC64:
989 case SCMP_ARCH_PPC64LE:
990 /* These we know we support (i.e. are the ones that do not use socketcall()) */
991 supported = true;
992 break;
993
994 case SCMP_ARCH_S390:
995 case SCMP_ARCH_S390X:
996 case SCMP_ARCH_PPC:
997 case SCMP_ARCH_X86:
998 default:
999 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1000 * don't know */
1001 supported = false;
1002 break;
1003 }
1004
1005 if (!supported)
1006 continue;
1007
1008 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1009 if (r < 0)
1010 return r;
1011
1012 if (whitelist) {
1013 int af, first = 0, last = 0;
1014 void *afp;
1015
1016 /* If this is a whitelist, we first block the address families that are out of range and then
1017 * everything that is not in the set. First, we find the lowest and highest address family in
1018 * the set. */
1019
1020 SET_FOREACH(afp, address_families, i) {
1021 af = PTR_TO_INT(afp);
1022
1023 if (af <= 0 || af >= af_max())
1024 continue;
1025
1026 if (first == 0 || af < first)
1027 first = af;
1028
1029 if (last == 0 || af > last)
1030 last = af;
1031 }
1032
1033 assert((first == 0) == (last == 0));
1034
1035 if (first == 0) {
1036
1037 /* No entries in the valid range, block everything */
1038 r = seccomp_rule_add_exact(
1039 seccomp,
1040 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1041 SCMP_SYS(socket),
1042 0);
1043 if (r < 0) {
1044 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1045 continue;
1046 }
1047
1048 } else {
1049
1050 /* Block everything below the first entry */
1051 r = seccomp_rule_add_exact(
1052 seccomp,
1053 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1054 SCMP_SYS(socket),
1055 1,
1056 SCMP_A0(SCMP_CMP_LT, first));
1057 if (r < 0) {
1058 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1059 continue;
1060 }
1061
1062 /* Block everything above the last entry */
1063 r = seccomp_rule_add_exact(
1064 seccomp,
1065 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1066 SCMP_SYS(socket),
1067 1,
1068 SCMP_A0(SCMP_CMP_GT, last));
1069 if (r < 0) {
1070 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1071 continue;
1072 }
1073
1074 /* Block everything between the first and last entry */
1075 for (af = 1; af < af_max(); af++) {
1076
1077 if (set_contains(address_families, INT_TO_PTR(af)))
1078 continue;
1079
1080 r = seccomp_rule_add_exact(
1081 seccomp,
1082 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1083 SCMP_SYS(socket),
1084 1,
1085 SCMP_A0(SCMP_CMP_EQ, af));
1086 if (r < 0)
1087 break;
1088 }
1089
1090 if (r < 0) {
1091 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1092 continue;
1093 }
1094 }
1095
1096 } else {
1097 void *af;
1098
1099 /* If this is a blacklist, then generate one rule for
1100 * each address family that are then combined in OR
1101 * checks. */
1102
1103 SET_FOREACH(af, address_families, i) {
1104
1105 r = seccomp_rule_add_exact(
1106 seccomp,
1107 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1108 SCMP_SYS(socket),
1109 1,
1110 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1111 if (r < 0)
1112 break;
1113 }
1114
1115 if (r < 0) {
1116 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1117 continue;
1118 }
1119 }
1120
1121 r = seccomp_load(seccomp);
1122 if (IN_SET(r, -EPERM, -EACCES))
1123 return r;
1124 if (r < 0)
1125 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1126 }
1127
1128 return 0;
1129 }
1130
1131 int seccomp_restrict_realtime(void) {
1132 static const int permitted_policies[] = {
1133 SCHED_OTHER,
1134 SCHED_BATCH,
1135 SCHED_IDLE,
1136 };
1137
1138 int r, max_policy = 0;
1139 uint32_t arch;
1140 unsigned i;
1141
1142 /* Determine the highest policy constant we want to allow */
1143 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1144 if (permitted_policies[i] > max_policy)
1145 max_policy = permitted_policies[i];
1146
1147 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1148 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1149 int p;
1150
1151 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1152
1153 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1154 if (r < 0)
1155 return r;
1156
1157 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1158 * whitelist. */
1159 for (p = 0; p < max_policy; p++) {
1160 bool good = false;
1161
1162 /* Check if this is in the whitelist. */
1163 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1164 if (permitted_policies[i] == p) {
1165 good = true;
1166 break;
1167 }
1168
1169 if (good)
1170 continue;
1171
1172 /* Deny this policy */
1173 r = seccomp_rule_add_exact(
1174 seccomp,
1175 SCMP_ACT_ERRNO(EPERM),
1176 SCMP_SYS(sched_setscheduler),
1177 1,
1178 SCMP_A1(SCMP_CMP_EQ, p));
1179 if (r < 0) {
1180 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1181 continue;
1182 }
1183 }
1184
1185 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1186 * unsigned here, hence no need no check for < 0 values. */
1187 r = seccomp_rule_add_exact(
1188 seccomp,
1189 SCMP_ACT_ERRNO(EPERM),
1190 SCMP_SYS(sched_setscheduler),
1191 1,
1192 SCMP_A1(SCMP_CMP_GT, max_policy));
1193 if (r < 0) {
1194 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1195 continue;
1196 }
1197
1198 r = seccomp_load(seccomp);
1199 if (IN_SET(r, -EPERM, -EACCES))
1200 return r;
1201 if (r < 0)
1202 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1203 }
1204
1205 return 0;
1206 }
1207
1208 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1209 uint32_t arch,
1210 int nr,
1211 unsigned int arg_cnt,
1212 const struct scmp_arg_cmp arg) {
1213 int r;
1214
1215 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1216 if (r < 0) {
1217 _cleanup_free_ char *n = NULL;
1218
1219 n = seccomp_syscall_resolve_num_arch(arch, nr);
1220 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1221 strna(n),
1222 seccomp_arch_to_string(arch));
1223 }
1224
1225 return r;
1226 }
1227
/* For known architectures, check that syscalls are indeed defined or not.
 *
 * These are compile-time checks on the build architecture: on x86-64, arm and arm64 the shmget(), shmat() and
 * shmdt() system call numbers are expected to be positive, on i386 and ppc64 they are expected to be
 * negative. The build fails if the assumption does not hold. */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#elif defined(__i386__) || defined(__powerpc64__)
assert_cc(SCMP_SYS(shmget) < 0);
assert_cc(SCMP_SYS(shmat) < 0);
assert_cc(SCMP_SYS(shmdt) < 0);
#endif
1238
1239 int seccomp_memory_deny_write_execute(void) {
1240
1241 uint32_t arch;
1242 int r;
1243
1244 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1245 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1246 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1247
1248 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1249
1250 switch (arch) {
1251
1252 case SCMP_ARCH_X86:
1253 filter_syscall = SCMP_SYS(mmap2);
1254 block_syscall = SCMP_SYS(mmap);
1255 break;
1256
1257 case SCMP_ARCH_PPC64:
1258 case SCMP_ARCH_PPC64LE:
1259 filter_syscall = SCMP_SYS(mmap);
1260
1261 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1262 * We ignore that here, which means there's still a way to get writable/executable
1263 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1264
1265 break;
1266
1267 case SCMP_ARCH_ARM:
1268 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1269 shmat_syscall = SCMP_SYS(shmat);
1270 break;
1271
1272 case SCMP_ARCH_X86_64:
1273 case SCMP_ARCH_X32:
1274 case SCMP_ARCH_AARCH64:
1275 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1276 shmat_syscall = SCMP_SYS(shmat);
1277 break;
1278
1279 /* Please add more definitions here, if you port systemd to other architectures! */
1280
1281 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1282 #warning "Consider adding the right mmap() syscall definitions here!"
1283 #endif
1284 }
1285
1286 /* Can't filter mmap() on this arch, then skip it */
1287 if (filter_syscall == 0)
1288 continue;
1289
1290 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1291 if (r < 0)
1292 return r;
1293
1294 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1295 1,
1296 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1297 if (r < 0)
1298 continue;
1299
1300 if (block_syscall != 0) {
1301 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1302 if (r < 0)
1303 continue;
1304 }
1305
1306 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1307 1,
1308 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1309 if (r < 0)
1310 continue;
1311
1312 if (shmat_syscall != 0) {
1313 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1314 1,
1315 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1316 if (r < 0)
1317 continue;
1318 }
1319
1320 r = seccomp_load(seccomp);
1321 if (IN_SET(r, -EPERM, -EACCES))
1322 return r;
1323 if (r < 0)
1324 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1325 }
1326
1327 return 0;
1328 }
1329
1330 int seccomp_restrict_archs(Set *archs) {
1331 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1332 Iterator i;
1333 void *id;
1334 int r;
1335
1336 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1337 * list. */
1338
1339 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1340 if (!seccomp)
1341 return -ENOMEM;
1342
1343 SET_FOREACH(id, archs, i) {
1344 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1345 if (r == -EEXIST)
1346 continue;
1347 if (r < 0)
1348 return r;
1349 }
1350
1351 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1352 if (r < 0)
1353 return r;
1354
1355 return seccomp_load(seccomp);
1356 }
1357
1358 int parse_syscall_archs(char **l, Set **archs) {
1359 _cleanup_set_free_ Set *_archs;
1360 char **s;
1361 int r;
1362
1363 assert(l);
1364 assert(archs);
1365
1366 r = set_ensure_allocated(&_archs, NULL);
1367 if (r < 0)
1368 return r;
1369
1370 STRV_FOREACH(s, l) {
1371 uint32_t a;
1372
1373 r = seccomp_arch_from_string(*s, &a);
1374 if (r < 0)
1375 return -EINVAL;
1376
1377 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1378 if (r < 0)
1379 return -ENOMEM;
1380 }
1381
1382 *archs = _archs;
1383 _archs = NULL;
1384
1385 return 0;
1386 }
1387
1388 int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
1389 const char *i;
1390 int r;
1391
1392 assert(set);
1393
1394 NULSTR_FOREACH(i, set->value) {
1395
1396 if (i[0] == '@') {
1397 const SyscallFilterSet *more;
1398
1399 more = syscall_filter_set_find(i);
1400 if (!more)
1401 return -ENXIO;
1402
1403
1404 r = seccomp_filter_set_add(filter, add, more);
1405 if (r < 0)
1406 return r;
1407 } else {
1408 int id;
1409
1410 id = seccomp_syscall_resolve_name(i);
1411 if (id == __NR_SCMP_ERROR)
1412 return -ENXIO;
1413
1414 if (add) {
1415 r = set_put(filter, INT_TO_PTR(id + 1));
1416 if (r < 0)
1417 return r;
1418 } else
1419 (void) set_remove(filter, INT_TO_PTR(id + 1));
1420 }
1421 }
1422
1423 return 0;
1424 }
1425
1426 int seccomp_lock_personality(unsigned long personality) {
1427 uint32_t arch;
1428 int r;
1429
1430 if (personality >= PERSONALITY_INVALID)
1431 return -EINVAL;
1432
1433 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1434 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1435
1436 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1437 if (r < 0)
1438 return r;
1439
1440 r = seccomp_rule_add_exact(
1441 seccomp,
1442 SCMP_ACT_ERRNO(EPERM),
1443 SCMP_SYS(personality),
1444 1,
1445 SCMP_A0(SCMP_CMP_NE, personality));
1446 if (r < 0)
1447 return r;
1448
1449 r = seccomp_load(seccomp);
1450 if (IN_SET(r, -EPERM, -EACCES))
1451 return r;
1452 if (r < 0)
1453 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1454 }
1455
1456 return 0;
1457 }