]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #23099 from yuwata/sd-bus-track-fixlets
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10 #include <sys/stat.h>
11
12 /* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
14 #include <seccomp.h>
15
16 #include "af-list.h"
17 #include "alloc-util.h"
18 #include "env-util.h"
19 #include "errno-list.h"
20 #include "macro.h"
21 #include "nsflags.h"
22 #include "nulstr-util.h"
23 #include "process-util.h"
24 #include "seccomp-util.h"
25 #include "set.h"
26 #include "string-util.h"
27 #include "strv.h"
28
/* List of seccomp architecture tokens relevant for the ABI this file is compiled for, terminated by
 * SECCOMP_LOCAL_ARCH_END. At most one of the preprocessor branches below contributes entries; if none
 * matches, the array consists of the terminator only.
 *
 * This array will be modified at runtime as seccomp_restrict_archs is called. */
uint32_t seccomp_local_archs[] = {

        /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */

#if defined(__x86_64__) && defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,         /* native */
#elif defined(__x86_64__) && !defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X32,
                SCMP_ARCH_X86_64,      /* native */
#elif defined(__i386__)
                SCMP_ARCH_X86,
#elif defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,     /* native */
#elif defined(__arm__)
                SCMP_ARCH_ARM,
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,        /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,    /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,   /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32, /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64LE,
                SCMP_ARCH_PPC64,       /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,     /* native */
#elif defined(__powerpc__)
                SCMP_ARCH_PPC,
#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
                SCMP_ARCH_RISCV64,
#elif defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,       /* native */
#elif defined(__s390__)
                SCMP_ARCH_S390,
#endif
                SECCOMP_LOCAL_ARCH_END
        };
103
104 const char* seccomp_arch_to_string(uint32_t c) {
105 /* Maintain order used in <seccomp.h>.
106 *
107 * Names used here should be the same as those used for ConditionArchitecture=,
108 * except for "subarchitectures" like x32. */
109
110 switch (c) {
111 case SCMP_ARCH_NATIVE:
112 return "native";
113 case SCMP_ARCH_X86:
114 return "x86";
115 case SCMP_ARCH_X86_64:
116 return "x86-64";
117 case SCMP_ARCH_X32:
118 return "x32";
119 case SCMP_ARCH_ARM:
120 return "arm";
121 case SCMP_ARCH_AARCH64:
122 return "arm64";
123 case SCMP_ARCH_MIPS:
124 return "mips";
125 case SCMP_ARCH_MIPS64:
126 return "mips64";
127 case SCMP_ARCH_MIPS64N32:
128 return "mips64-n32";
129 case SCMP_ARCH_MIPSEL:
130 return "mips-le";
131 case SCMP_ARCH_MIPSEL64:
132 return "mips64-le";
133 case SCMP_ARCH_MIPSEL64N32:
134 return "mips64-le-n32";
135 case SCMP_ARCH_PPC:
136 return "ppc";
137 case SCMP_ARCH_PPC64:
138 return "ppc64";
139 case SCMP_ARCH_PPC64LE:
140 return "ppc64-le";
141 #ifdef SCMP_ARCH_RISCV64
142 case SCMP_ARCH_RISCV64:
143 return "riscv64";
144 #endif
145 case SCMP_ARCH_S390:
146 return "s390";
147 case SCMP_ARCH_S390X:
148 return "s390x";
149 default:
150 return NULL;
151 }
152 }
153
154 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
155 if (!n)
156 return -EINVAL;
157
158 assert(ret);
159
160 if (streq(n, "native"))
161 *ret = SCMP_ARCH_NATIVE;
162 else if (streq(n, "x86"))
163 *ret = SCMP_ARCH_X86;
164 else if (streq(n, "x86-64"))
165 *ret = SCMP_ARCH_X86_64;
166 else if (streq(n, "x32"))
167 *ret = SCMP_ARCH_X32;
168 else if (streq(n, "arm"))
169 *ret = SCMP_ARCH_ARM;
170 else if (streq(n, "arm64"))
171 *ret = SCMP_ARCH_AARCH64;
172 else if (streq(n, "mips"))
173 *ret = SCMP_ARCH_MIPS;
174 else if (streq(n, "mips64"))
175 *ret = SCMP_ARCH_MIPS64;
176 else if (streq(n, "mips64-n32"))
177 *ret = SCMP_ARCH_MIPS64N32;
178 else if (streq(n, "mips-le"))
179 *ret = SCMP_ARCH_MIPSEL;
180 else if (streq(n, "mips64-le"))
181 *ret = SCMP_ARCH_MIPSEL64;
182 else if (streq(n, "mips64-le-n32"))
183 *ret = SCMP_ARCH_MIPSEL64N32;
184 else if (streq(n, "ppc"))
185 *ret = SCMP_ARCH_PPC;
186 else if (streq(n, "ppc64"))
187 *ret = SCMP_ARCH_PPC64;
188 else if (streq(n, "ppc64-le"))
189 *ret = SCMP_ARCH_PPC64LE;
190 #ifdef SCMP_ARCH_RISCV64
191 else if (streq(n, "riscv64"))
192 *ret = SCMP_ARCH_RISCV64;
193 #endif
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
198 else
199 return -EINVAL;
200
201 return 0;
202 }
203
204 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
205 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
206 int r;
207
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
219 if (r < 0)
220 return r;
221
222 r = seccomp_arch_add(seccomp, arch);
223 if (r < 0)
224 return r;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
235 if (r < 0)
236 return r;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 return r;
241
242 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
243 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
244 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
245 if (r < 0)
246 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
247 }
248 #endif
249
250 *ret = TAKE_PTR(seccomp);
251 return 0;
252 }
253
/* Reports whether querying the current seccomp mode via prctl(PR_GET_SECCOMP) succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        if (mode < 0)
                return false;

        return true;
}
257
/* Probes for seccomp filter mode by trying to install a NULL filter program. The mode is reported as
 * available only if that attempt fails with EFAULT.
 *
 * NOTE(review): presumably EFAULT means the kernel got as far as reading the (NULL) program pointer,
 * i.e. it knows about SECCOMP_MODE_FILTER — confirm against the prctl(2) documentation. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
262
/* Reports whether seccomp filtering can be used. The answer is computed on the first call and cached in
 * a function-local static for all later calls.
 *
 * If $SYSTEMD_SECCOMP parses as false, seccomp is reported as unavailable without probing. In every
 * other case (unset, true, or unparsable) both availability probes have to succeed. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;
        int b;

        if (cached_enabled >= 0)
                return cached_enabled;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly turned off via the environment. */
                cached_enabled = false;
                return cached_enabled;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
283
/* Table of the named system call groups, indexed by the SYSCALL_FILTER_SET_* constants.
 *
 * For each group, .name is the '@'-prefixed group name that syscall_filter_set_find() matches against,
 * .help is a short human-readable description, and .value is a NUL-separated list of member names.
 * A member starting with '@' refers to another group of this table; the consumers in this file
 * (seccomp_add_syscall_filter_item() and seccomp_parse_syscall_filter()) resolve such members
 * recursively. */
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
        [SYSCALL_FILTER_SET_DEFAULT] = {
                .name = "@default",
                .help = "System calls that are always permitted",
                .value =
                "arch_prctl\0"      /* Used during platform-specific initialization by ld-linux.so. */
                "brk\0"
                "cacheflush\0"
                "clock_getres\0"
                "clock_getres_time64\0"
                "clock_gettime\0"
                "clock_gettime64\0"
                "clock_nanosleep\0"
                "clock_nanosleep_time64\0"
                "execve\0"
                "exit\0"
                "exit_group\0"
                "futex\0"
                "futex_time64\0"
                "get_robust_list\0"
                "get_thread_area\0"
                "getegid\0"
                "getegid32\0"
                "geteuid\0"
                "geteuid32\0"
                "getgid\0"
                "getgid32\0"
                "getgroups\0"
                "getgroups32\0"
                "getpgid\0"
                "getpgrp\0"
                "getpid\0"
                "getppid\0"
                "getrandom\0"
                "getresgid\0"
                "getresgid32\0"
                "getresuid\0"
                "getresuid32\0"
                "getrlimit\0"      /* make sure processes can query stack size and such */
                "getsid\0"
                "gettid\0"
                "gettimeofday\0"
                "getuid\0"
                "getuid32\0"
                "membarrier\0"
                "mmap\0"
                "mmap2\0"
                "mprotect\0"
                "munmap\0"
                "nanosleep\0"
                "pause\0"
                "prlimit64\0"
                "restart_syscall\0"
                "rseq\0"
                "rt_sigreturn\0"
                "sched_getaffinity\0"
                "sched_yield\0"
                "set_robust_list\0"
                "set_thread_area\0"
                "set_tid_address\0"
                "set_tls\0"
                "sigreturn\0"
                "time\0"
                "ugetrlimit\0"
        },
        [SYSCALL_FILTER_SET_AIO] = {
                .name = "@aio",
                .help = "Asynchronous IO",
                .value =
                "io_cancel\0"
                "io_destroy\0"
                "io_getevents\0"
                "io_pgetevents\0"
                "io_pgetevents_time64\0"
                "io_setup\0"
                "io_submit\0"
                "io_uring_enter\0"
                "io_uring_register\0"
                "io_uring_setup\0"
        },
        [SYSCALL_FILTER_SET_BASIC_IO] = {
                .name = "@basic-io",
                .help = "Basic IO",
                .value =
                "_llseek\0"
                "close\0"
                "close_range\0"
                "dup\0"
                "dup2\0"
                "dup3\0"
                "lseek\0"
                "pread64\0"
                "preadv\0"
                "preadv2\0"
                "pwrite64\0"
                "pwritev\0"
                "pwritev2\0"
                "read\0"
                "readv\0"
                "write\0"
                "writev\0"
        },
        [SYSCALL_FILTER_SET_CHOWN] = {
                .name = "@chown",
                .help = "Change ownership of files and directories",
                .value =
                "chown\0"
                "chown32\0"
                "fchown\0"
                "fchown32\0"
                "fchownat\0"
                "lchown\0"
                "lchown32\0"
        },
        [SYSCALL_FILTER_SET_CLOCK] = {
                .name = "@clock",
                .help = "Change the system time",
                .value =
                "adjtimex\0"
                "clock_adjtime\0"
                "clock_adjtime64\0"
                "clock_settime\0"
                "clock_settime64\0"
                "settimeofday\0"
        },
        [SYSCALL_FILTER_SET_CPU_EMULATION] = {
                .name = "@cpu-emulation",
                .help = "System calls for CPU emulation functionality",
                .value =
                "modify_ldt\0"
                "subpage_prot\0"
                "switch_endian\0"
                "vm86\0"
                "vm86old\0"
        },
        [SYSCALL_FILTER_SET_DEBUG] = {
                .name = "@debug",
                .help = "Debugging, performance monitoring and tracing functionality",
                .value =
                "lookup_dcookie\0"
                "perf_event_open\0"
                "pidfd_getfd\0"
                "ptrace\0"
                "rtas\0"
#if defined __s390__ || defined __s390x__
                "s390_runtime_instr\0"
#endif
                "sys_debug_setcontext\0"
        },
        [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
                .name = "@file-system",
                .help = "File system operations",
                .value =
                "access\0"
                "chdir\0"
                "chmod\0"
                "close\0"
                "creat\0"
                "faccessat\0"
                "faccessat2\0"
                "fallocate\0"
                "fchdir\0"
                "fchmod\0"
                "fchmodat\0"
                "fcntl\0"
                "fcntl64\0"
                "fgetxattr\0"
                "flistxattr\0"
                "fremovexattr\0"
                "fsetxattr\0"
                "fstat\0"
                "fstat64\0"
                "fstatat64\0"
                "fstatfs\0"
                "fstatfs64\0"
                "ftruncate\0"
                "ftruncate64\0"
                "futimesat\0"
                "getcwd\0"
                "getdents\0"
                "getdents64\0"
                "getxattr\0"
                "inotify_add_watch\0"
                "inotify_init\0"
                "inotify_init1\0"
                "inotify_rm_watch\0"
                "lgetxattr\0"
                "link\0"
                "linkat\0"
                "listxattr\0"
                "llistxattr\0"
                "lremovexattr\0"
                "lsetxattr\0"
                "lstat\0"
                "lstat64\0"
                "mkdir\0"
                "mkdirat\0"
                "mknod\0"
                "mknodat\0"
                "newfstatat\0"
                "oldfstat\0"
                "oldlstat\0"
                "oldstat\0"
                "open\0"
                "openat\0"
                "openat2\0"
                "readlink\0"
                "readlinkat\0"
                "removexattr\0"
                "rename\0"
                "renameat\0"
                "renameat2\0"
                "rmdir\0"
                "setxattr\0"
                "stat\0"
                "stat64\0"
                "statfs\0"
                "statfs64\0"
                "statx\0"
                "symlink\0"
                "symlinkat\0"
                "truncate\0"
                "truncate64\0"
                "unlink\0"
                "unlinkat\0"
                "utime\0"
                "utimensat\0"
                "utimensat_time64\0"
                "utimes\0"
        },
        [SYSCALL_FILTER_SET_IO_EVENT] = {
                .name = "@io-event",
                .help = "Event loop system calls",
                .value =
                "_newselect\0"
                "epoll_create\0"
                "epoll_create1\0"
                "epoll_ctl\0"
                "epoll_ctl_old\0"
                "epoll_pwait\0"
                "epoll_pwait2\0"
                "epoll_wait\0"
                "epoll_wait_old\0"
                "eventfd\0"
                "eventfd2\0"
                "poll\0"
                "ppoll\0"
                "ppoll_time64\0"
                "pselect6\0"
                "pselect6_time64\0"
                "select\0"
        },
        [SYSCALL_FILTER_SET_IPC] = {
                .name = "@ipc",
                .help = "SysV IPC, POSIX Message Queues or other IPC",
                .value =
                "ipc\0"
                "memfd_create\0"
                "mq_getsetattr\0"
                "mq_notify\0"
                "mq_open\0"
                "mq_timedreceive\0"
                "mq_timedreceive_time64\0"
                "mq_timedsend\0"
                "mq_timedsend_time64\0"
                "mq_unlink\0"
                "msgctl\0"
                "msgget\0"
                "msgrcv\0"
                "msgsnd\0"
                "pipe\0"
                "pipe2\0"
                "process_madvise\0"
                "process_vm_readv\0"
                "process_vm_writev\0"
                "semctl\0"
                "semget\0"
                "semop\0"
                "semtimedop\0"
                "semtimedop_time64\0"
                "shmat\0"
                "shmctl\0"
                "shmdt\0"
                "shmget\0"
        },
        [SYSCALL_FILTER_SET_KEYRING] = {
                .name = "@keyring",
                .help = "Kernel keyring access",
                .value =
                "add_key\0"
                "keyctl\0"
                "request_key\0"
        },
        [SYSCALL_FILTER_SET_MEMLOCK] = {
                .name = "@memlock",
                .help = "Memory locking control",
                .value =
                "mlock\0"
                "mlock2\0"
                "mlockall\0"
                "munlock\0"
                "munlockall\0"
        },
        [SYSCALL_FILTER_SET_MODULE] = {
                .name = "@module",
                .help = "Loading and unloading of kernel modules",
                .value =
                "delete_module\0"
                "finit_module\0"
                "init_module\0"
        },
        [SYSCALL_FILTER_SET_MOUNT] = {
                .name = "@mount",
                .help = "Mounting and unmounting of file systems",
                .value =
                "chroot\0"
                "fsconfig\0"
                "fsmount\0"
                "fsopen\0"
                "fspick\0"
                "mount\0"
                "mount_setattr\0"
                "move_mount\0"
                "open_tree\0"
                "pivot_root\0"
                "umount\0"
                "umount2\0"
        },
        [SYSCALL_FILTER_SET_NETWORK_IO] = {
                .name = "@network-io",
                .help = "Network or Unix socket IO, should not be needed if not network facing",
                .value =
                "accept\0"
                "accept4\0"
                "bind\0"
                "connect\0"
                "getpeername\0"
                "getsockname\0"
                "getsockopt\0"
                "listen\0"
                "recv\0"
                "recvfrom\0"
                "recvmmsg\0"
                "recvmmsg_time64\0"
                "recvmsg\0"
                "send\0"
                "sendmmsg\0"
                "sendmsg\0"
                "sendto\0"
                "setsockopt\0"
                "shutdown\0"
                "socket\0"
                "socketcall\0"
                "socketpair\0"
        },
        [SYSCALL_FILTER_SET_OBSOLETE] = {
                /* some unknown even to libseccomp */
                .name = "@obsolete",
                .help = "Unusual, obsolete or unimplemented system calls",
                .value =
                "_sysctl\0"
                "afs_syscall\0"
                "bdflush\0"
                "break\0"
                "create_module\0"
                "ftime\0"
                "get_kernel_syms\0"
                "getpmsg\0"
                "gtty\0"
                "idle\0"
                "lock\0"
                "mpx\0"
                "prof\0"
                "profil\0"
                "putpmsg\0"
                "query_module\0"
                "security\0"
                "sgetmask\0"
                "ssetmask\0"
                "stime\0"
                "stty\0"
                "sysfs\0"
                "tuxcall\0"
                "ulimit\0"
                "uselib\0"
                "ustat\0"
                "vserver\0"
        },
        [SYSCALL_FILTER_SET_PKEY] = {
                .name = "@pkey",
                .help = "System calls used for memory protection keys",
                .value =
                "pkey_alloc\0"
                "pkey_free\0"
                "pkey_mprotect\0"
        },
        [SYSCALL_FILTER_SET_PRIVILEGED] = {
                .name = "@privileged",
                .help = "All system calls which need super-user capabilities",
                .value =
                "@chown\0"
                "@clock\0"
                "@module\0"
                "@raw-io\0"
                "@reboot\0"
                "@swap\0"
                "_sysctl\0"
                "acct\0"
                "bpf\0"
                "capset\0"
                "chroot\0"
                "fanotify_init\0"
                "fanotify_mark\0"
                "nfsservctl\0"
                "open_by_handle_at\0"
                "pivot_root\0"
                "quotactl\0"
                "setdomainname\0"
                "setfsuid\0"
                "setfsuid32\0"
                "setgroups\0"
                "setgroups32\0"
                "sethostname\0"
                "setresuid\0"
                "setresuid32\0"
                "setreuid\0"
                "setreuid32\0"
                "setuid\0"      /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
                "setuid32\0"
                "vhangup\0"
        },
        [SYSCALL_FILTER_SET_PROCESS] = {
                .name = "@process",
                .help = "Process control, execution, namespacing operations",
                .value =
                "capget\0"      /* Able to query arbitrary processes */
                "clone\0"
                /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
                 * implement seccomp, so we don't need to list it at all. C.f.
                 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
                "clone3\0"
                "execveat\0"
                "fork\0"
                "getrusage\0"
                "kill\0"
                "pidfd_open\0"
                "pidfd_send_signal\0"
                "prctl\0"
                "rt_sigqueueinfo\0"
                "rt_tgsigqueueinfo\0"
                "setns\0"
                "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
                "tgkill\0"
                "times\0"
                "tkill\0"
                "unshare\0"
                "vfork\0"
                "wait4\0"
                "waitid\0"
                "waitpid\0"
        },
        [SYSCALL_FILTER_SET_RAW_IO] = {
                .name = "@raw-io",
                .help = "Raw I/O port access",
                .value =
                "ioperm\0"
                "iopl\0"
                "pciconfig_iobase\0"
                "pciconfig_read\0"
                "pciconfig_write\0"
#if defined __s390__ || defined __s390x__
                "s390_pci_mmio_read\0"
                "s390_pci_mmio_write\0"
#endif
        },
        [SYSCALL_FILTER_SET_REBOOT] = {
                .name = "@reboot",
                .help = "Reboot and reboot preparation/kexec",
                .value =
                "kexec_file_load\0"
                "kexec_load\0"
                "reboot\0"
        },
        [SYSCALL_FILTER_SET_RESOURCES] = {
                .name = "@resources",
                .help = "Alter resource settings",
                .value =
                "ioprio_set\0"
                "mbind\0"
                "migrate_pages\0"
                "move_pages\0"
                "nice\0"
                "sched_setaffinity\0"
                "sched_setattr\0"
                "sched_setparam\0"
                "sched_setscheduler\0"
                "set_mempolicy\0"
                "setpriority\0"
                "setrlimit\0"
        },
        [SYSCALL_FILTER_SET_SETUID] = {
                .name = "@setuid",
                .help = "Operations for changing user/group credentials",
                .value =
                "setgid\0"
                "setgid32\0"
                "setgroups\0"
                "setgroups32\0"
                "setregid\0"
                "setregid32\0"
                "setresgid\0"
                "setresgid32\0"
                "setresuid\0"
                "setresuid32\0"
                "setreuid\0"
                "setreuid32\0"
                "setuid\0"
                "setuid32\0"
        },
        [SYSCALL_FILTER_SET_SIGNAL] = {
                .name = "@signal",
                .help = "Process signal handling",
                .value =
                "rt_sigaction\0"
                "rt_sigpending\0"
                "rt_sigprocmask\0"
                "rt_sigsuspend\0"
                "rt_sigtimedwait\0"
                "rt_sigtimedwait_time64\0"
                "sigaction\0"
                "sigaltstack\0"
                "signal\0"
                "signalfd\0"
                "signalfd4\0"
                "sigpending\0"
                "sigprocmask\0"
                "sigsuspend\0"
        },
        [SYSCALL_FILTER_SET_SWAP] = {
                .name = "@swap",
                .help = "Enable/disable swap devices",
                .value =
                "swapoff\0"
                "swapon\0"
        },
        [SYSCALL_FILTER_SET_SYNC] = {
                .name = "@sync",
                .help = "Synchronize files and memory to storage",
                .value =
                "fdatasync\0"
                "fsync\0"
                "msync\0"
                "sync\0"
                "sync_file_range\0"
                "sync_file_range2\0"
                "syncfs\0"
        },
        [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
                .name = "@system-service",
                .help = "General system service operations",
                .value =
                "@aio\0"
                "@basic-io\0"
                "@chown\0"
                "@default\0"
                "@file-system\0"
                "@io-event\0"
                "@ipc\0"
                "@keyring\0"
                "@memlock\0"
                "@network-io\0"
                "@process\0"
                "@resources\0"
                "@setuid\0"
                "@signal\0"
                "@sync\0"
                "@timer\0"
                "capget\0"
                "capset\0"
                "copy_file_range\0"
                "fadvise64\0"
                "fadvise64_64\0"
                "flock\0"
                "get_mempolicy\0"
                "getcpu\0"
                "getpriority\0"
                "ioctl\0"
                "ioprio_get\0"
                "kcmp\0"
                "madvise\0"
                "mremap\0"
                "name_to_handle_at\0"
                "oldolduname\0"
                "olduname\0"
                "personality\0"
                "readahead\0"
                "readdir\0"
                "remap_file_pages\0"
                "sched_get_priority_max\0"
                "sched_get_priority_min\0"
                "sched_getattr\0"
                "sched_getparam\0"
                "sched_getscheduler\0"
                "sched_rr_get_interval\0"
                "sched_rr_get_interval_time64\0"
                "sched_yield\0"
                "sendfile\0"
                "sendfile64\0"
                "setfsgid\0"
                "setfsgid32\0"
                "setfsuid\0"
                "setfsuid32\0"
                "setpgid\0"
                "setsid\0"
                "splice\0"
                "sysinfo\0"
                "tee\0"
                "umask\0"
                "uname\0"
                "userfaultfd\0"
                "vmsplice\0"
        },
        [SYSCALL_FILTER_SET_TIMER] = {
                .name = "@timer",
                .help = "Schedule operations by time",
                .value =
                "alarm\0"
                "getitimer\0"
                "setitimer\0"
                "timer_create\0"
                "timer_delete\0"
                "timer_getoverrun\0"
                "timer_gettime\0"
                "timer_gettime64\0"
                "timer_settime\0"
                "timer_settime64\0"
                "timerfd_create\0"
                "timerfd_gettime\0"
                "timerfd_gettime64\0"
                "timerfd_settime\0"
                "timerfd_settime64\0"
                "times\0"
        },
        [SYSCALL_FILTER_SET_KNOWN] = {
                .name = "@known",
                .help = "All known syscalls declared in the kernel",
                /* The member list of this group comes from the included file rather than being spelled
                 * out here. */
                .value =
#include "syscall-list.h"
        },
};
934
935 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
936 if (isempty(name) || name[0] != '@')
937 return NULL;
938
939 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
940 if (streq(syscall_filter_sets[i].name, name))
941 return syscall_filter_sets + i;
942
943 return NULL;
944 }
945
/* Forward declaration. seccomp_add_syscall_filter_item() below calls this function for '@' group names,
 * and this function in turn calls seccomp_add_syscall_filter_item() for every member of the group, so
 * one of the two has to be declared ahead of its definition. */
static int add_syscall_filter_set(
                scmp_filter_ctx seccomp,
                const SyscallFilterSet *set,
                uint32_t action,
                char **exclude,
                bool log_missing,
                char ***added);
953
954 int seccomp_add_syscall_filter_item(
955 scmp_filter_ctx *seccomp,
956 const char *name,
957 uint32_t action,
958 char **exclude,
959 bool log_missing,
960 char ***added) {
961
962 assert(seccomp);
963 assert(name);
964
965 if (strv_contains(exclude, name))
966 return 0;
967
968 /* Any syscalls that are handled are added to the *added strv. The pointer
969 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
970
971 if (name[0] == '@') {
972 const SyscallFilterSet *other;
973
974 other = syscall_filter_set_find(name);
975 if (!other)
976 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
977 "Filter set %s is not known!",
978 name);
979
980 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
981
982 } else {
983 int id, r;
984
985 id = seccomp_syscall_resolve_name(name);
986 if (id == __NR_SCMP_ERROR) {
987 if (log_missing)
988 log_debug("System call %s is not known, ignoring.", name);
989 return 0;
990 }
991
992 r = seccomp_rule_add_exact(seccomp, action, id, 0);
993 if (r < 0) {
994 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
995 bool ignore = r == -EDOM;
996
997 if (!ignore || log_missing)
998 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
999 name, id, ignore ? ", ignoring" : "");
1000 if (!ignore)
1001 return r;
1002 }
1003
1004 if (added) {
1005 r = strv_extend(added, name);
1006 if (r < 0)
1007 return r;
1008 }
1009
1010 return 0;
1011 }
1012 }
1013
1014 static int add_syscall_filter_set(
1015 scmp_filter_ctx seccomp,
1016 const SyscallFilterSet *set,
1017 uint32_t action,
1018 char **exclude,
1019 bool log_missing,
1020 char ***added) {
1021
1022 const char *sys;
1023 int r;
1024
1025 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1026
1027 assert(seccomp);
1028 assert(set);
1029
1030 NULSTR_FOREACH(sys, set->value) {
1031 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1032 if (r < 0)
1033 return r;
1034 }
1035
1036 return 0;
1037 }
1038
1039 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1040 uint32_t arch;
1041 int r;
1042
1043 assert(set);
1044
1045 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1046 * each local arch. */
1047
1048 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1049 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1050
1051 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1052
1053 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1054 if (r < 0)
1055 return r;
1056
1057 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1058 if (r < 0)
1059 return log_debug_errno(r, "Failed to add filter set: %m");
1060
1061 r = seccomp_load(seccomp);
1062 if (ERRNO_IS_SECCOMP_FATAL(r))
1063 return r;
1064 if (r < 0)
1065 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1066 }
1067
1068 return 0;
1069 }
1070
1071 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1072 uint32_t arch;
1073 int r;
1074
1075 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1076 * of a SyscallFilterSet* table. */
1077
1078 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1079 return 0;
1080
1081 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1082 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1083 void *syscall_id, *val;
1084
1085 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1086
1087 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1088 if (r < 0)
1089 return r;
1090
1091 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1092 uint32_t a = action;
1093 int id = PTR_TO_INT(syscall_id) - 1;
1094 int error = PTR_TO_INT(val);
1095
1096 if (error == SECCOMP_ERROR_NUMBER_KILL)
1097 a = scmp_act_kill_process();
1098 #ifdef SCMP_ACT_LOG
1099 else if (action == SCMP_ACT_LOG)
1100 a = SCMP_ACT_LOG;
1101 #endif
1102 else if (error >= 0)
1103 a = SCMP_ACT_ERRNO(error);
1104
1105 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1106 if (r < 0) {
1107 /* If the system call is not known on this architecture, then that's
1108 * fine, let's ignore it */
1109 _cleanup_free_ char *n = NULL;
1110 bool ignore;
1111
1112 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1113 ignore = r == -EDOM;
1114 if (!ignore || log_missing)
1115 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1116 strna(n), id, ignore ? ", ignoring" : "");
1117 if (!ignore)
1118 return r;
1119 }
1120 }
1121
1122 r = seccomp_load(seccomp);
1123 if (ERRNO_IS_SECCOMP_FATAL(r))
1124 return r;
1125 if (r < 0)
1126 log_debug_errno(r, "Failed to install systemc call filter for architecture %s, skipping: %m",
1127 seccomp_arch_to_string(arch));
1128 }
1129
1130 return 0;
1131 }
1132
1133 int seccomp_parse_syscall_filter(
1134 const char *name,
1135 int errno_num,
1136 Hashmap *filter,
1137 SeccompParseFlags flags,
1138 const char *unit,
1139 const char *filename,
1140 unsigned line) {
1141
1142 int r;
1143
1144 assert(name);
1145 assert(filter);
1146
1147 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1148 return -EINVAL;
1149
1150 if (name[0] == '@') {
1151 const SyscallFilterSet *set;
1152 const char *i;
1153
1154 set = syscall_filter_set_find(name);
1155 if (!set) {
1156 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1157 return -EINVAL;
1158
1159 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1160 "Unknown system call group, ignoring: %s", name);
1161 return 0;
1162 }
1163
1164 NULSTR_FOREACH(i, set->value) {
1165 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1166 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1167 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1168 * about them. */
1169 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1170 if (r < 0)
1171 return r;
1172 }
1173 } else {
1174 int id;
1175
1176 id = seccomp_syscall_resolve_name(name);
1177 if (id == __NR_SCMP_ERROR) {
1178 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1179 return -EINVAL;
1180
1181 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1182 "Failed to parse system call, ignoring: %s", name);
1183 return 0;
1184 }
1185
1186 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1187 * it from the list. The entries in allow-list with non-negative error value will be
1188 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1189 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1190 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1191 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1192 if (r < 0)
1193 switch (r) {
1194 case -ENOMEM:
1195 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1196 case -EEXIST:
1197 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1198 break;
1199 default:
1200 return r;
1201 }
1202 } else
1203 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1204 }
1205
1206 return 0;
1207 }
1208
1209 int seccomp_restrict_namespaces(unsigned long retain) {
1210 uint32_t arch;
1211 int r;
1212
1213 if (DEBUG_LOGGING) {
1214 _cleanup_free_ char *s = NULL;
1215
1216 (void) namespace_flags_to_string(retain, &s);
1217 log_debug("Restricting namespace to: %s.", strna(s));
1218 }
1219
1220 /* NOOP? */
1221 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1222 return 0;
1223
1224 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1225 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1226
1227 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1228
1229 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1230 if (r < 0)
1231 return r;
1232
1233 /* We cannot filter on individual flags to clone3(), and we need to disable the
1234 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1235 * users shall fall back to clone(), as if on an older kernel.
1236 *
1237 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1238 * https://github.com/moby/moby/issues/42680. */
1239
1240 r = seccomp_rule_add_exact(
1241 seccomp,
1242 SCMP_ACT_ERRNO(ENOSYS),
1243 SCMP_SYS(clone3),
1244 0);
1245 if (r < 0)
1246 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1247
1248 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1249 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1250 * altogether. */
1251 r = seccomp_rule_add_exact(
1252 seccomp,
1253 SCMP_ACT_ERRNO(EPERM),
1254 SCMP_SYS(setns),
1255 0);
1256 else
1257 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1258 * special invocation with a zero flags argument, right here. */
1259 r = seccomp_rule_add_exact(
1260 seccomp,
1261 SCMP_ACT_ERRNO(EPERM),
1262 SCMP_SYS(setns),
1263 1,
1264 SCMP_A1(SCMP_CMP_EQ, 0));
1265 if (r < 0) {
1266 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1267 continue;
1268 }
1269
1270 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1271 unsigned long f;
1272
1273 f = namespace_flag_map[i].flag;
1274 if (FLAGS_SET(retain, f)) {
1275 log_debug("Permitting %s.", namespace_flag_map[i].name);
1276 continue;
1277 }
1278
1279 log_debug("Blocking %s.", namespace_flag_map[i].name);
1280
1281 r = seccomp_rule_add_exact(
1282 seccomp,
1283 SCMP_ACT_ERRNO(EPERM),
1284 SCMP_SYS(unshare),
1285 1,
1286 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1287 if (r < 0) {
1288 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1289 break;
1290 }
1291
1292 /* On s390/s390x the first two parameters to clone are switched */
1293 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1294 r = seccomp_rule_add_exact(
1295 seccomp,
1296 SCMP_ACT_ERRNO(EPERM),
1297 SCMP_SYS(clone),
1298 1,
1299 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1300 else
1301 r = seccomp_rule_add_exact(
1302 seccomp,
1303 SCMP_ACT_ERRNO(EPERM),
1304 SCMP_SYS(clone),
1305 1,
1306 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1307 if (r < 0) {
1308 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1309 break;
1310 }
1311
1312 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1313 r = seccomp_rule_add_exact(
1314 seccomp,
1315 SCMP_ACT_ERRNO(EPERM),
1316 SCMP_SYS(setns),
1317 1,
1318 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1319 if (r < 0) {
1320 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1321 break;
1322 }
1323 }
1324 }
1325 if (r < 0)
1326 continue;
1327
1328 r = seccomp_load(seccomp);
1329 if (ERRNO_IS_SECCOMP_FATAL(r))
1330 return r;
1331 if (r < 0)
1332 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333 }
1334
1335 return 0;
1336 }
1337
1338 int seccomp_protect_sysctl(void) {
1339 uint32_t arch;
1340 int r;
1341
1342 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1343 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1344
1345 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1346
1347 if (IN_SET(arch,
1348 SCMP_ARCH_AARCH64,
1349 #ifdef SCMP_ARCH_RISCV64
1350 SCMP_ARCH_RISCV64,
1351 #endif
1352 SCMP_ARCH_X32
1353 ))
1354 /* No _sysctl syscall */
1355 continue;
1356
1357 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1358 if (r < 0)
1359 return r;
1360
1361 r = seccomp_rule_add_exact(
1362 seccomp,
1363 SCMP_ACT_ERRNO(EPERM),
1364 SCMP_SYS(_sysctl),
1365 0);
1366 if (r < 0) {
1367 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1368 continue;
1369 }
1370
1371 r = seccomp_load(seccomp);
1372 if (ERRNO_IS_SECCOMP_FATAL(r))
1373 return r;
1374 if (r < 0)
1375 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1376 }
1377
1378 return 0;
1379 }
1380
1381 int seccomp_protect_syslog(void) {
1382 uint32_t arch;
1383 int r;
1384
1385 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1386 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1387
1388 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1389 if (r < 0)
1390 return r;
1391
1392 r = seccomp_rule_add_exact(
1393 seccomp,
1394 SCMP_ACT_ERRNO(EPERM),
1395 SCMP_SYS(syslog),
1396 0);
1397
1398 if (r < 0) {
1399 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1400 continue;
1401 }
1402
1403 r = seccomp_load(seccomp);
1404 if (ERRNO_IS_SECCOMP_FATAL(r))
1405 return r;
1406 if (r < 0)
1407 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1408 }
1409
1410 return 0;
1411 }
1412
1413 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1414 uint32_t arch;
1415 int r;
1416
1417 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1418 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1419 bool supported;
1420
1421 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1422
1423 switch (arch) {
1424
1425 case SCMP_ARCH_X86_64:
1426 case SCMP_ARCH_X32:
1427 case SCMP_ARCH_ARM:
1428 case SCMP_ARCH_AARCH64:
1429 case SCMP_ARCH_MIPSEL64N32:
1430 case SCMP_ARCH_MIPS64N32:
1431 case SCMP_ARCH_MIPSEL64:
1432 case SCMP_ARCH_MIPS64:
1433 #ifdef SCMP_ARCH_RISCV64
1434 case SCMP_ARCH_RISCV64:
1435 #endif
1436 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1437 supported = true;
1438 break;
1439
1440 case SCMP_ARCH_S390:
1441 case SCMP_ARCH_S390X:
1442 case SCMP_ARCH_X86:
1443 case SCMP_ARCH_MIPSEL:
1444 case SCMP_ARCH_MIPS:
1445 case SCMP_ARCH_PPC:
1446 case SCMP_ARCH_PPC64:
1447 case SCMP_ARCH_PPC64LE:
1448 default:
1449 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1450 * don't know */
1451 supported = false;
1452 break;
1453 }
1454
1455 if (!supported)
1456 continue;
1457
1458 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1459 if (r < 0)
1460 return r;
1461
1462 if (allow_list) {
1463 int first = 0, last = 0;
1464 void *afp;
1465
1466 /* If this is an allow list, we first block the address families that are out of
1467 * range and then everything that is not in the set. First, we find the lowest and
1468 * highest address family in the set. */
1469
1470 SET_FOREACH(afp, address_families) {
1471 int af = PTR_TO_INT(afp);
1472
1473 if (af <= 0 || af >= af_max())
1474 continue;
1475
1476 if (first == 0 || af < first)
1477 first = af;
1478
1479 if (last == 0 || af > last)
1480 last = af;
1481 }
1482
1483 assert((first == 0) == (last == 0));
1484
1485 if (first == 0) {
1486
1487 /* No entries in the valid range, block everything */
1488 r = seccomp_rule_add_exact(
1489 seccomp,
1490 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1491 SCMP_SYS(socket),
1492 0);
1493 if (r < 0) {
1494 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1495 continue;
1496 }
1497
1498 } else {
1499
1500 /* Block everything below the first entry */
1501 r = seccomp_rule_add_exact(
1502 seccomp,
1503 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1504 SCMP_SYS(socket),
1505 1,
1506 SCMP_A0(SCMP_CMP_LT, first));
1507 if (r < 0) {
1508 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1509 continue;
1510 }
1511
1512 /* Block everything above the last entry */
1513 r = seccomp_rule_add_exact(
1514 seccomp,
1515 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1516 SCMP_SYS(socket),
1517 1,
1518 SCMP_A0(SCMP_CMP_GT, last));
1519 if (r < 0) {
1520 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1521 continue;
1522 }
1523
1524 /* Block everything between the first and last entry */
1525 for (int af = 1; af < af_max(); af++) {
1526
1527 if (set_contains(address_families, INT_TO_PTR(af)))
1528 continue;
1529
1530 r = seccomp_rule_add_exact(
1531 seccomp,
1532 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1533 SCMP_SYS(socket),
1534 1,
1535 SCMP_A0(SCMP_CMP_EQ, af));
1536 if (r < 0)
1537 break;
1538 }
1539 if (r < 0) {
1540 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1541 continue;
1542 }
1543 }
1544
1545 } else {
1546 void *af;
1547
1548 /* If this is a deny list, then generate one rule for each address family that are
1549 * then combined in OR checks. */
1550
1551 SET_FOREACH(af, address_families) {
1552 r = seccomp_rule_add_exact(
1553 seccomp,
1554 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1555 SCMP_SYS(socket),
1556 1,
1557 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1558 if (r < 0)
1559 break;
1560 }
1561 if (r < 0) {
1562 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1563 continue;
1564 }
1565 }
1566
1567 r = seccomp_load(seccomp);
1568 if (ERRNO_IS_SECCOMP_FATAL(r))
1569 return r;
1570 if (r < 0)
1571 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1572 }
1573
1574 return 0;
1575 }
1576
1577 int seccomp_restrict_realtime(void) {
1578 static const int permitted_policies[] = {
1579 SCHED_OTHER,
1580 SCHED_BATCH,
1581 SCHED_IDLE,
1582 };
1583
1584 int r, max_policy = 0;
1585 uint32_t arch;
1586 unsigned i;
1587
1588 /* Determine the highest policy constant we want to allow */
1589 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1590 if (permitted_policies[i] > max_policy)
1591 max_policy = permitted_policies[i];
1592
1593 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1594 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1595 int p;
1596
1597 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1598
1599 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1600 if (r < 0)
1601 return r;
1602
1603 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1604 * allow list. */
1605 for (p = 0; p < max_policy; p++) {
1606 bool good = false;
1607
1608 /* Check if this is in the allow list. */
1609 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1610 if (permitted_policies[i] == p) {
1611 good = true;
1612 break;
1613 }
1614
1615 if (good)
1616 continue;
1617
1618 /* Deny this policy */
1619 r = seccomp_rule_add_exact(
1620 seccomp,
1621 SCMP_ACT_ERRNO(EPERM),
1622 SCMP_SYS(sched_setscheduler),
1623 1,
1624 SCMP_A1(SCMP_CMP_EQ, p));
1625 if (r < 0) {
1626 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1627 continue;
1628 }
1629 }
1630
1631 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1632 * are unsigned here, hence no need no check for < 0 values. */
1633 r = seccomp_rule_add_exact(
1634 seccomp,
1635 SCMP_ACT_ERRNO(EPERM),
1636 SCMP_SYS(sched_setscheduler),
1637 1,
1638 SCMP_A1(SCMP_CMP_GT, max_policy));
1639 if (r < 0) {
1640 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1641 continue;
1642 }
1643
1644 r = seccomp_load(seccomp);
1645 if (ERRNO_IS_SECCOMP_FATAL(r))
1646 return r;
1647 if (r < 0)
1648 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1649 }
1650
1651 return 0;
1652 }
1653
1654 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1655 uint32_t arch,
1656 int nr,
1657 unsigned arg_cnt,
1658 const struct scmp_arg_cmp arg) {
1659 int r;
1660
1661 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1662 if (r < 0) {
1663 _cleanup_free_ char *n = NULL;
1664
1665 n = seccomp_syscall_resolve_num_arch(arch, nr);
1666 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1667 strna(n),
1668 seccomp_arch_to_string(arch));
1669 }
1670
1671 return r;
1672 }
1673
1674 /* For known architectures, check that syscalls are indeed defined or not. */
1675 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1676 assert_cc(SCMP_SYS(shmget) > 0);
1677 assert_cc(SCMP_SYS(shmat) > 0);
1678 assert_cc(SCMP_SYS(shmdt) > 0);
1679 #endif
1680
1681 int seccomp_memory_deny_write_execute(void) {
1682 uint32_t arch;
1683 unsigned loaded = 0;
1684
1685 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1686 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1687 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1688
1689 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1690
1691 switch (arch) {
1692
1693 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1694 * We ignore that here, which means there's still a way to get writable/executable
1695 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1696
1697 case SCMP_ARCH_X86:
1698 case SCMP_ARCH_S390:
1699 filter_syscall = SCMP_SYS(mmap2);
1700 block_syscall = SCMP_SYS(mmap);
1701 /* shmat multiplexed, see above */
1702 break;
1703
1704 case SCMP_ARCH_PPC:
1705 case SCMP_ARCH_PPC64:
1706 case SCMP_ARCH_PPC64LE:
1707 case SCMP_ARCH_S390X:
1708 filter_syscall = SCMP_SYS(mmap);
1709 /* shmat multiplexed, see above */
1710 break;
1711
1712 case SCMP_ARCH_ARM:
1713 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1714 shmat_syscall = SCMP_SYS(shmat);
1715 break;
1716
1717 case SCMP_ARCH_X86_64:
1718 case SCMP_ARCH_X32:
1719 case SCMP_ARCH_AARCH64:
1720 #ifdef SCMP_ARCH_RISCV64
1721 case SCMP_ARCH_RISCV64:
1722 #endif
1723 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1724 shmat_syscall = SCMP_SYS(shmat);
1725 break;
1726
1727 /* Please add more definitions here, if you port systemd to other architectures! */
1728
1729 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1730 #warning "Consider adding the right mmap() syscall definitions here!"
1731 #endif
1732 }
1733
1734 /* Can't filter mmap() on this arch, then skip it */
1735 if (filter_syscall == 0)
1736 continue;
1737
1738 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1739 if (r < 0)
1740 return r;
1741
1742 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1743 1,
1744 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1745 if (r < 0)
1746 continue;
1747
1748 if (block_syscall != 0) {
1749 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1750 if (r < 0)
1751 continue;
1752 }
1753
1754 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1755 1,
1756 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1757 if (r < 0)
1758 continue;
1759
1760 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1761 1,
1762 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1763 if (r < 0)
1764 continue;
1765
1766 if (shmat_syscall > 0) {
1767 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1768 1,
1769 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1770 if (r < 0)
1771 continue;
1772 }
1773
1774 r = seccomp_load(seccomp);
1775 if (ERRNO_IS_SECCOMP_FATAL(r))
1776 return r;
1777 if (r < 0)
1778 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1779 seccomp_arch_to_string(arch));
1780 loaded++;
1781 }
1782
1783 if (loaded == 0)
1784 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1785
1786 return loaded;
1787 }
1788
1789 int seccomp_restrict_archs(Set *archs) {
1790 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1791 int r;
1792 bool blocked_new = false;
1793
1794 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1795 * list.
1796 *
1797 * There are some qualifications. However the most important use is to stop processes from bypassing
1798 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1799 * in a non-native architecture. There are no holes in this use case, at least so far. */
1800
1801 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1802 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1803 * to run a program with the restrictions applied. */
1804 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1805 if (!seccomp)
1806 return -ENOMEM;
1807
1808 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1809 uint32_t arch = seccomp_local_archs[i];
1810
1811 /* See above comment, our "native" architecture is never blocked. */
1812 if (arch == seccomp_arch_native())
1813 continue;
1814
1815 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1816 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1817 continue;
1818
1819 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1820
1821 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1822 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1823 * The important thing is that you can block the old 32-bit x86 syscalls.
1824 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1825 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1826 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1827
1828 if (block) {
1829 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1830 blocked_new = true;
1831 } else {
1832 r = seccomp_arch_add(seccomp, arch);
1833 if (r < 0 && r != -EEXIST)
1834 return r;
1835 }
1836 }
1837
1838 /* All architectures that will be blocked by the seccomp program were
1839 * already blocked. */
1840 if (!blocked_new)
1841 return 0;
1842
1843 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1844 if (r < 0)
1845 return r;
1846
1847 r = seccomp_load(seccomp);
1848 if (ERRNO_IS_SECCOMP_FATAL(r))
1849 return r;
1850 if (r < 0)
1851 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1852
1853 return 0;
1854 }
1855
1856 int parse_syscall_archs(char **l, Set **ret_archs) {
1857 _cleanup_set_free_ Set *archs = NULL;
1858 int r;
1859
1860 assert(l);
1861 assert(ret_archs);
1862
1863 STRV_FOREACH(s, l) {
1864 uint32_t a;
1865
1866 r = seccomp_arch_from_string(*s, &a);
1867 if (r < 0)
1868 return -EINVAL;
1869
1870 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1871 if (r < 0)
1872 return -ENOMEM;
1873 }
1874
1875 *ret_archs = TAKE_PTR(archs);
1876 return 0;
1877 }
1878
1879 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1880 const char *i;
1881 int r;
1882
1883 assert(set);
1884
1885 NULSTR_FOREACH(i, set->value) {
1886
1887 if (i[0] == '@') {
1888 const SyscallFilterSet *more;
1889
1890 more = syscall_filter_set_find(i);
1891 if (!more)
1892 return -ENXIO;
1893
1894 r = seccomp_filter_set_add(filter, add, more);
1895 if (r < 0)
1896 return r;
1897 } else {
1898 int id;
1899
1900 id = seccomp_syscall_resolve_name(i);
1901 if (id == __NR_SCMP_ERROR) {
1902 log_debug("Couldn't resolve system call, ignoring: %s", i);
1903 continue;
1904 }
1905
1906 if (add) {
1907 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1908 if (r < 0)
1909 return r;
1910 } else
1911 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1912 }
1913 }
1914
1915 return 0;
1916 }
1917
1918 int seccomp_lock_personality(unsigned long personality) {
1919 uint32_t arch;
1920 int r;
1921
1922 if (personality >= PERSONALITY_INVALID)
1923 return -EINVAL;
1924
1925 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1926 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1927
1928 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1929 if (r < 0)
1930 return r;
1931
1932 r = seccomp_rule_add_exact(
1933 seccomp,
1934 SCMP_ACT_ERRNO(EPERM),
1935 SCMP_SYS(personality),
1936 1,
1937 SCMP_A0(SCMP_CMP_NE, personality));
1938 if (r < 0) {
1939 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1940 continue;
1941 }
1942
1943 r = seccomp_load(seccomp);
1944 if (ERRNO_IS_SECCOMP_FATAL(r))
1945 return r;
1946 if (r < 0)
1947 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1948 }
1949
1950 return 0;
1951 }
1952
1953 int seccomp_protect_hostname(void) {
1954 uint32_t arch;
1955 int r;
1956
1957 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1958 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1959
1960 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1961 if (r < 0)
1962 return r;
1963
1964 r = seccomp_rule_add_exact(
1965 seccomp,
1966 SCMP_ACT_ERRNO(EPERM),
1967 SCMP_SYS(sethostname),
1968 0);
1969 if (r < 0) {
1970 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1971 continue;
1972 }
1973
1974 r = seccomp_rule_add_exact(
1975 seccomp,
1976 SCMP_ACT_ERRNO(EPERM),
1977 SCMP_SYS(setdomainname),
1978 0);
1979 if (r < 0) {
1980 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1981 continue;
1982 }
1983
1984 r = seccomp_load(seccomp);
1985 if (ERRNO_IS_SECCOMP_FATAL(r))
1986 return r;
1987 if (r < 0)
1988 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1989 }
1990
1991 return 0;
1992 }
1993
1994 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1995 /* Checks the mode_t parameter of the following system calls:
1996 *
1997 * → chmod() + fchmod() + fchmodat()
1998 * → open() + creat() + openat()
1999 * → mkdir() + mkdirat()
2000 * → mknod() + mknodat()
2001 *
2002 * Returns error if *everything* failed, and 0 otherwise.
2003 */
2004 int r;
2005 bool any = false;
2006
2007 r = seccomp_rule_add_exact(
2008 seccomp,
2009 SCMP_ACT_ERRNO(EPERM),
2010 SCMP_SYS(chmod),
2011 1,
2012 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2013 if (r < 0)
2014 log_debug_errno(r, "Failed to add filter for chmod: %m");
2015 else
2016 any = true;
2017
2018 r = seccomp_rule_add_exact(
2019 seccomp,
2020 SCMP_ACT_ERRNO(EPERM),
2021 SCMP_SYS(fchmod),
2022 1,
2023 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2024 if (r < 0)
2025 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2026 else
2027 any = true;
2028
2029 r = seccomp_rule_add_exact(
2030 seccomp,
2031 SCMP_ACT_ERRNO(EPERM),
2032 SCMP_SYS(fchmodat),
2033 1,
2034 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2035 if (r < 0)
2036 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2037 else
2038 any = true;
2039
2040 r = seccomp_rule_add_exact(
2041 seccomp,
2042 SCMP_ACT_ERRNO(EPERM),
2043 SCMP_SYS(mkdir),
2044 1,
2045 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2046 if (r < 0)
2047 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2048 else
2049 any = true;
2050
2051 r = seccomp_rule_add_exact(
2052 seccomp,
2053 SCMP_ACT_ERRNO(EPERM),
2054 SCMP_SYS(mkdirat),
2055 1,
2056 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2057 if (r < 0)
2058 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2059 else
2060 any = true;
2061
2062 r = seccomp_rule_add_exact(
2063 seccomp,
2064 SCMP_ACT_ERRNO(EPERM),
2065 SCMP_SYS(mknod),
2066 1,
2067 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2068 if (r < 0)
2069 log_debug_errno(r, "Failed to add filter for mknod: %m");
2070 else
2071 any = true;
2072
2073 r = seccomp_rule_add_exact(
2074 seccomp,
2075 SCMP_ACT_ERRNO(EPERM),
2076 SCMP_SYS(mknodat),
2077 1,
2078 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2079 if (r < 0)
2080 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2081 else
2082 any = true;
2083
2084 r = seccomp_rule_add_exact(
2085 seccomp,
2086 SCMP_ACT_ERRNO(EPERM),
2087 SCMP_SYS(open),
2088 2,
2089 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2090 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2091 if (r < 0)
2092 log_debug_errno(r, "Failed to add filter for open: %m");
2093 else
2094 any = true;
2095
2096 r = seccomp_rule_add_exact(
2097 seccomp,
2098 SCMP_ACT_ERRNO(EPERM),
2099 SCMP_SYS(openat),
2100 2,
2101 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2102 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2103 if (r < 0)
2104 log_debug_errno(r, "Failed to add filter for openat: %m");
2105 else
2106 any = true;
2107
2108 #if defined(__SNR_openat2)
2109 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2110 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2111 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2112 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2113 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2114 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2115 r = seccomp_rule_add_exact(
2116 seccomp,
2117 SCMP_ACT_ERRNO(ENOSYS),
2118 SCMP_SYS(openat2),
2119 0);
2120 if (r < 0)
2121 log_debug_errno(r, "Failed to add filter for openat2: %m");
2122 else
2123 any = true;
2124 #endif
2125
2126 r = seccomp_rule_add_exact(
2127 seccomp,
2128 SCMP_ACT_ERRNO(EPERM),
2129 SCMP_SYS(creat),
2130 1,
2131 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2132 if (r < 0)
2133 log_debug_errno(r, "Failed to add filter for creat: %m");
2134 else
2135 any = true;
2136
2137 return any ? 0 : r;
2138 }
2139
2140 int seccomp_restrict_suid_sgid(void) {
2141 uint32_t arch;
2142 int r, k;
2143
2144 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2145 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2146
2147 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2148 if (r < 0)
2149 return r;
2150
2151 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2152 if (r < 0)
2153 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2154
2155 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2156 if (k < 0)
2157 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2158
2159 if (r < 0 && k < 0)
2160 continue;
2161
2162 r = seccomp_load(seccomp);
2163 if (ERRNO_IS_SECCOMP_FATAL(r))
2164 return r;
2165 if (r < 0)
2166 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2167 }
2168
2169 return 0;
2170 }
2171
2172 uint32_t scmp_act_kill_process(void) {
2173
2174 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2175 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2176 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2177 * for single-threaded apps does the right thing. */
2178
2179 #ifdef SCMP_ACT_KILL_PROCESS
2180 if (seccomp_api_get() >= 3)
2181 return SCMP_ACT_KILL_PROCESS;
2182 #endif
2183
2184 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2185 }
2186
2187 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2188 _cleanup_free_ char *n = NULL;
2189 char *p;
2190 int e = -1;
2191
2192 assert(in);
2193 assert(name);
2194 assert(error);
2195
2196 /*
2197 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2198 * If errno is omitted, then error is set to -1.
2199 * Empty syscall name is not allowed.
2200 * Here, we do not check that the syscall name is valid or not.
2201 */
2202
2203 p = strchr(in, ':');
2204 if (p) {
2205 e = seccomp_parse_errno_or_action(p + 1);
2206 if (e < 0)
2207 return e;
2208
2209 n = strndup(in, p - in);
2210 } else
2211 n = strdup(in);
2212
2213 if (!n)
2214 return -ENOMEM;
2215
2216 if (isempty(n))
2217 return -EINVAL;
2218
2219 *error = e;
2220 *name = TAKE_PTR(n);
2221
2222 return 0;
2223 }
2224
2225 static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2226 bool any = false;
2227 int r;
2228
2229 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2230 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2231
2232 r = seccomp_rule_add_exact(
2233 seccomp,
2234 SCMP_ACT_ERRNO(EINVAL),
2235 SCMP_SYS(open),
2236 1,
2237 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2238 if (r < 0)
2239 log_debug_errno(r, "Failed to add filter for open: %m");
2240 else
2241 any = true;
2242
2243 r = seccomp_rule_add_exact(
2244 seccomp,
2245 SCMP_ACT_ERRNO(EINVAL),
2246 SCMP_SYS(openat),
2247 1,
2248 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2249 if (r < 0)
2250 log_debug_errno(r, "Failed to add filter for openat: %m");
2251 else
2252 any = true;
2253
2254 #if defined(__SNR_openat2)
2255 /* The new openat2() system call can't be filtered sensibly, see above. */
2256 r = seccomp_rule_add_exact(
2257 seccomp,
2258 SCMP_ACT_ERRNO(ENOSYS),
2259 SCMP_SYS(openat2),
2260 0);
2261 if (r < 0)
2262 log_debug_errno(r, "Failed to add filter for openat2: %m");
2263 else
2264 any = true;
2265 #endif
2266
2267 return any ? 0 : r;
2268 }
2269
2270 int seccomp_suppress_sync(void) {
2271 uint32_t arch;
2272 int r;
2273
2274 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2275 * manageable, and also masks O_SYNC/O_DSYNC */
2276
2277 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2278 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2279 const char *c;
2280
2281 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2282 if (r < 0)
2283 return r;
2284
2285 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2286 int id;
2287
2288 id = seccomp_syscall_resolve_name(c);
2289 if (id == __NR_SCMP_ERROR) {
2290 log_debug("System call %s is not known, ignoring.", c);
2291 continue;
2292 }
2293
2294 r = seccomp_rule_add_exact(
2295 seccomp,
2296 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2297 id,
2298 0);
2299 if (r < 0)
2300 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2301 }
2302
2303 (void) block_open_flag(seccomp, O_SYNC);
2304 #if O_DSYNC != O_SYNC
2305 (void) block_open_flag(seccomp, O_DSYNC);
2306 #endif
2307
2308 r = seccomp_load(seccomp);
2309 if (ERRNO_IS_SECCOMP_FATAL(r))
2310 return r;
2311 if (r < 0)
2312 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2313 }
2314
2315 return 0;
2316 }