]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #24986 from keszybz/news-systemd-measure
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10 #include <sys/stat.h>
11
/* Include missing_syscall_def.h before <seccomp.h>, so that __SNR_foo is mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
14 #include <seccomp.h>
15
16 #include "af-list.h"
17 #include "alloc-util.h"
18 #include "env-util.h"
19 #include "errno-list.h"
20 #include "macro.h"
21 #include "namespace-util.h"
22 #include "nsflags.h"
23 #include "nulstr-util.h"
24 #include "process-util.h"
25 #include "seccomp-util.h"
26 #include "set.h"
27 #include "string-util.h"
28 #include "strv.h"
29
/* The seccomp architectures (ABIs) that filters are generated for on this build target. The contents
 * are selected at compile time by the preprocessor conditions below, and the list is always terminated
 * by SECCOMP_LOCAL_ARCH_END. If none of the conditions match, the array consists of the end marker only.
 *
 * This array will be modified at runtime as seccomp_restrict_archs is called. */
uint32_t seccomp_local_archs[] = {

        /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */

#if defined(__x86_64__) && defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,         /* native */
#elif defined(__x86_64__) && !defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X32,
                SCMP_ARCH_X86_64,      /* native */
#elif defined(__i386__)
                SCMP_ARCH_X86,
#elif defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,     /* native */
#elif defined(__arm__)
                SCMP_ARCH_ARM,
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,        /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,    /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,   /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32, /* native */
#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
                /* The PARISC constants are only used if the installed <seccomp.h> defines them. */
                SCMP_ARCH_PARISC,
                SCMP_ARCH_PARISC64,    /* native */
#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
                SCMP_ARCH_PARISC,
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64LE,
                SCMP_ARCH_PPC64,       /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,     /* native */
#elif defined(__powerpc__)
                SCMP_ARCH_PPC,
#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
                /* Likewise, RISCV64 is only listed if the installed <seccomp.h> defines it. */
                SCMP_ARCH_RISCV64,
#elif defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,       /* native */
#elif defined(__s390__)
                SCMP_ARCH_S390,
#endif
                SECCOMP_LOCAL_ARCH_END
        };
109
110 const char* seccomp_arch_to_string(uint32_t c) {
111 /* Maintain order used in <seccomp.h>.
112 *
113 * Names used here should be the same as those used for ConditionArchitecture=,
114 * except for "subarchitectures" like x32. */
115
116 switch (c) {
117 case SCMP_ARCH_NATIVE:
118 return "native";
119 case SCMP_ARCH_X86:
120 return "x86";
121 case SCMP_ARCH_X86_64:
122 return "x86-64";
123 case SCMP_ARCH_X32:
124 return "x32";
125 case SCMP_ARCH_ARM:
126 return "arm";
127 case SCMP_ARCH_AARCH64:
128 return "arm64";
129 case SCMP_ARCH_MIPS:
130 return "mips";
131 case SCMP_ARCH_MIPS64:
132 return "mips64";
133 case SCMP_ARCH_MIPS64N32:
134 return "mips64-n32";
135 case SCMP_ARCH_MIPSEL:
136 return "mips-le";
137 case SCMP_ARCH_MIPSEL64:
138 return "mips64-le";
139 case SCMP_ARCH_MIPSEL64N32:
140 return "mips64-le-n32";
141 #ifdef SCMP_ARCH_PARISC
142 case SCMP_ARCH_PARISC:
143 return "parisc";
144 #endif
145 #ifdef SCMP_ARCH_PARISC64
146 case SCMP_ARCH_PARISC64:
147 return "parisc64";
148 #endif
149 case SCMP_ARCH_PPC:
150 return "ppc";
151 case SCMP_ARCH_PPC64:
152 return "ppc64";
153 case SCMP_ARCH_PPC64LE:
154 return "ppc64-le";
155 #ifdef SCMP_ARCH_RISCV64
156 case SCMP_ARCH_RISCV64:
157 return "riscv64";
158 #endif
159 case SCMP_ARCH_S390:
160 return "s390";
161 case SCMP_ARCH_S390X:
162 return "s390x";
163 default:
164 return NULL;
165 }
166 }
167
168 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
169 if (!n)
170 return -EINVAL;
171
172 assert(ret);
173
174 if (streq(n, "native"))
175 *ret = SCMP_ARCH_NATIVE;
176 else if (streq(n, "x86"))
177 *ret = SCMP_ARCH_X86;
178 else if (streq(n, "x86-64"))
179 *ret = SCMP_ARCH_X86_64;
180 else if (streq(n, "x32"))
181 *ret = SCMP_ARCH_X32;
182 else if (streq(n, "arm"))
183 *ret = SCMP_ARCH_ARM;
184 else if (streq(n, "arm64"))
185 *ret = SCMP_ARCH_AARCH64;
186 else if (streq(n, "mips"))
187 *ret = SCMP_ARCH_MIPS;
188 else if (streq(n, "mips64"))
189 *ret = SCMP_ARCH_MIPS64;
190 else if (streq(n, "mips64-n32"))
191 *ret = SCMP_ARCH_MIPS64N32;
192 else if (streq(n, "mips-le"))
193 *ret = SCMP_ARCH_MIPSEL;
194 else if (streq(n, "mips64-le"))
195 *ret = SCMP_ARCH_MIPSEL64;
196 else if (streq(n, "mips64-le-n32"))
197 *ret = SCMP_ARCH_MIPSEL64N32;
198 #ifdef SCMP_ARCH_PARISC
199 else if (streq(n, "parisc"))
200 *ret = SCMP_ARCH_PARISC;
201 #endif
202 #ifdef SCMP_ARCH_PARISC64
203 else if (streq(n, "parisc64"))
204 *ret = SCMP_ARCH_PARISC64;
205 #endif
206 else if (streq(n, "ppc"))
207 *ret = SCMP_ARCH_PPC;
208 else if (streq(n, "ppc64"))
209 *ret = SCMP_ARCH_PPC64;
210 else if (streq(n, "ppc64-le"))
211 *ret = SCMP_ARCH_PPC64LE;
212 #ifdef SCMP_ARCH_RISCV64
213 else if (streq(n, "riscv64"))
214 *ret = SCMP_ARCH_RISCV64;
215 #endif
216 else if (streq(n, "s390"))
217 *ret = SCMP_ARCH_S390;
218 else if (streq(n, "s390x"))
219 *ret = SCMP_ARCH_S390X;
220 else
221 return -EINVAL;
222
223 return 0;
224 }
225
226 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
227 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
228 int r;
229
230 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
231 * any others. Also, turns off the NNP fiddling. */
232
233 seccomp = seccomp_init(default_action);
234 if (!seccomp)
235 return -ENOMEM;
236
237 if (arch != SCMP_ARCH_NATIVE &&
238 arch != seccomp_arch_native()) {
239
240 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
241 if (r < 0)
242 return r;
243
244 r = seccomp_arch_add(seccomp, arch);
245 if (r < 0)
246 return r;
247
248 assert(seccomp_arch_exist(seccomp, arch) >= 0);
249 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
250 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
251 } else {
252 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
253 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
254 }
255
256 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
257 if (r < 0)
258 return r;
259
260 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
261 if (r < 0)
262 return r;
263
264 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
265 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
266 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
267 if (r < 0)
268 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
269 }
270 #endif
271
272 *ret = TAKE_PTR(seccomp);
273 return 0;
274 }
275
static bool is_basic_seccomp_available(void) {
        /* Queries the current seccomp mode of the calling thread; the query itself succeeding is taken as
         * the sign that the kernel has seccomp support at all. */
        int mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
279
static bool is_seccomp_filter_available(void) {
        /* Probes for seccomp filter mode by asking to install a NULL filter program. The request can never
         * succeed usefully; only a failure with EFAULT is counted as "filter mode is available". */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
284
bool is_seccomp_available(void) {
        /* Tells whether seccomp filtering can and shall be used. The answer is determined on the first
         * call and served from a static cache afterwards (-1 means "not determined yet"). */
        static int cached = -1;
        int b;

        if (cached >= 0)
                return cached;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly turned off via the environment. */
                cached = false;
                return cached;
        }

        /* Either explicitly enabled, unset, or unparsable. In all of these cases probe the kernel. */
        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached;
}
305
/* Table of the predefined system call groups ("filter sets"), indexed by the SYSCALL_FILTER_SET_* constants.
 *
 * For each set:
 *   .name  is the group name; it always starts with "@" and is what syscall_filter_set_find() matches on.
 *   .help  is a short human-readable description.
 *   .value lists the members. Every member is written as a string literal ending in an explicit "\0";
 *          the adjacent literals are concatenated by the compiler into a single sequence of NUL-terminated
 *          names, which the code in this file walks with NULSTR_FOREACH(). A member that starts with "@"
 *          is a reference to another set of this table rather than a system call name. */
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
        [SYSCALL_FILTER_SET_DEFAULT] = {
                .name = "@default",
                .help = "System calls that are always permitted",
                .value =
                "arch_prctl\0"      /* Used during platform-specific initialization by ld-linux.so. */
                "brk\0"
                "cacheflush\0"
                "clock_getres\0"
                "clock_getres_time64\0"
                "clock_gettime\0"
                "clock_gettime64\0"
                "clock_nanosleep\0"
                "clock_nanosleep_time64\0"
                "execve\0"
                "exit\0"
                "exit_group\0"
                "futex\0"
                "futex_time64\0"
                "get_robust_list\0"
                "get_thread_area\0"
                "getegid\0"
                "getegid32\0"
                "geteuid\0"
                "geteuid32\0"
                "getgid\0"
                "getgid32\0"
                "getgroups\0"
                "getgroups32\0"
                "getpgid\0"
                "getpgrp\0"
                "getpid\0"
                "getppid\0"
                "getrandom\0"
                "getresgid\0"
                "getresgid32\0"
                "getresuid\0"
                "getresuid32\0"
                "getrlimit\0"      /* make sure processes can query stack size and such */
                "getsid\0"
                "gettid\0"
                "gettimeofday\0"
                "getuid\0"
                "getuid32\0"
                "membarrier\0"
                "mmap\0"
                "mmap2\0"
                "mprotect\0"
                "munmap\0"
                "nanosleep\0"
                "pause\0"
                "prlimit64\0"
                "restart_syscall\0"
                "riscv_flush_icache\0"
                "rseq\0"
                "rt_sigreturn\0"
                "sched_getaffinity\0"
                "sched_yield\0"
                "set_robust_list\0"
                "set_thread_area\0"
                "set_tid_address\0"
                "set_tls\0"
                "sigreturn\0"
                "time\0"
                "ugetrlimit\0"
        },
        [SYSCALL_FILTER_SET_AIO] = {
                .name = "@aio",
                .help = "Asynchronous IO",
                .value =
                "io_cancel\0"
                "io_destroy\0"
                "io_getevents\0"
                "io_pgetevents\0"
                "io_pgetevents_time64\0"
                "io_setup\0"
                "io_submit\0"
                "io_uring_enter\0"
                "io_uring_register\0"
                "io_uring_setup\0"
        },
        [SYSCALL_FILTER_SET_BASIC_IO] = {
                .name = "@basic-io",
                .help = "Basic IO",
                .value =
                "_llseek\0"
                "close\0"
                "close_range\0"
                "dup\0"
                "dup2\0"
                "dup3\0"
                "lseek\0"
                "pread64\0"
                "preadv\0"
                "preadv2\0"
                "pwrite64\0"
                "pwritev\0"
                "pwritev2\0"
                "read\0"
                "readv\0"
                "write\0"
                "writev\0"
        },
        [SYSCALL_FILTER_SET_CHOWN] = {
                .name = "@chown",
                .help = "Change ownership of files and directories",
                .value =
                "chown\0"
                "chown32\0"
                "fchown\0"
                "fchown32\0"
                "fchownat\0"
                "lchown\0"
                "lchown32\0"
        },
        [SYSCALL_FILTER_SET_CLOCK] = {
                .name = "@clock",
                .help = "Change the system time",
                .value =
                "adjtimex\0"
                "clock_adjtime\0"
                "clock_adjtime64\0"
                "clock_settime\0"
                "clock_settime64\0"
                "settimeofday\0"
        },
        [SYSCALL_FILTER_SET_CPU_EMULATION] = {
                .name = "@cpu-emulation",
                .help = "System calls for CPU emulation functionality",
                .value =
                "modify_ldt\0"
                "subpage_prot\0"
                "switch_endian\0"
                "vm86\0"
                "vm86old\0"
        },
        [SYSCALL_FILTER_SET_DEBUG] = {
                .name = "@debug",
                .help = "Debugging, performance monitoring and tracing functionality",
                .value =
                "lookup_dcookie\0"
                "perf_event_open\0"
                "pidfd_getfd\0"
                "ptrace\0"
                "rtas\0"
                "s390_runtime_instr\0"
                "sys_debug_setcontext\0"
        },
        [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
                .name = "@file-system",
                .help = "File system operations",
                .value =
                "access\0"
                "chdir\0"
                "chmod\0"
                "close\0"
                "creat\0"
                "faccessat\0"
                "faccessat2\0"
                "fallocate\0"
                "fchdir\0"
                "fchmod\0"
                "fchmodat\0"
                "fcntl\0"
                "fcntl64\0"
                "fgetxattr\0"
                "flistxattr\0"
                "fremovexattr\0"
                "fsetxattr\0"
                "fstat\0"
                "fstat64\0"
                "fstatat64\0"
                "fstatfs\0"
                "fstatfs64\0"
                "ftruncate\0"
                "ftruncate64\0"
                "futimesat\0"
                "getcwd\0"
                "getdents\0"
                "getdents64\0"
                "getxattr\0"
                "inotify_add_watch\0"
                "inotify_init\0"
                "inotify_init1\0"
                "inotify_rm_watch\0"
                "lgetxattr\0"
                "link\0"
                "linkat\0"
                "listxattr\0"
                "llistxattr\0"
                "lremovexattr\0"
                "lsetxattr\0"
                "lstat\0"
                "lstat64\0"
                "mkdir\0"
                "mkdirat\0"
                "mknod\0"
                "mknodat\0"
                "newfstatat\0"
                "oldfstat\0"
                "oldlstat\0"
                "oldstat\0"
                "open\0"
                "openat\0"
                "openat2\0"
                "readlink\0"
                "readlinkat\0"
                "removexattr\0"
                "rename\0"
                "renameat\0"
                "renameat2\0"
                "rmdir\0"
                "setxattr\0"
                "stat\0"
                "stat64\0"
                "statfs\0"
                "statfs64\0"
                "statx\0"
                "symlink\0"
                "symlinkat\0"
                "truncate\0"
                "truncate64\0"
                "unlink\0"
                "unlinkat\0"
                "utime\0"
                "utimensat\0"
                "utimensat_time64\0"
                "utimes\0"
        },
        [SYSCALL_FILTER_SET_IO_EVENT] = {
                .name = "@io-event",
                .help = "Event loop system calls",
                .value =
                "_newselect\0"
                "epoll_create\0"
                "epoll_create1\0"
                "epoll_ctl\0"
                "epoll_ctl_old\0"
                "epoll_pwait\0"
                "epoll_pwait2\0"
                "epoll_wait\0"
                "epoll_wait_old\0"
                "eventfd\0"
                "eventfd2\0"
                "poll\0"
                "ppoll\0"
                "ppoll_time64\0"
                "pselect6\0"
                "pselect6_time64\0"
                "select\0"
        },
        [SYSCALL_FILTER_SET_IPC] = {
                .name = "@ipc",
                .help = "SysV IPC, POSIX Message Queues or other IPC",
                .value =
                "ipc\0"
                "memfd_create\0"
                "mq_getsetattr\0"
                "mq_notify\0"
                "mq_open\0"
                "mq_timedreceive\0"
                "mq_timedreceive_time64\0"
                "mq_timedsend\0"
                "mq_timedsend_time64\0"
                "mq_unlink\0"
                "msgctl\0"
                "msgget\0"
                "msgrcv\0"
                "msgsnd\0"
                "pipe\0"
                "pipe2\0"
                "process_madvise\0"
                "process_vm_readv\0"
                "process_vm_writev\0"
                "semctl\0"
                "semget\0"
                "semop\0"
                "semtimedop\0"
                "semtimedop_time64\0"
                "shmat\0"
                "shmctl\0"
                "shmdt\0"
                "shmget\0"
        },
        [SYSCALL_FILTER_SET_KEYRING] = {
                .name = "@keyring",
                .help = "Kernel keyring access",
                .value =
                "add_key\0"
                "keyctl\0"
                "request_key\0"
        },
        [SYSCALL_FILTER_SET_MEMLOCK] = {
                .name = "@memlock",
                .help = "Memory locking control",
                .value =
                "mlock\0"
                "mlock2\0"
                "mlockall\0"
                "munlock\0"
                "munlockall\0"
        },
        [SYSCALL_FILTER_SET_MODULE] = {
                .name = "@module",
                .help = "Loading and unloading of kernel modules",
                .value =
                "delete_module\0"
                "finit_module\0"
                "init_module\0"
        },
        [SYSCALL_FILTER_SET_MOUNT] = {
                .name = "@mount",
                .help = "Mounting and unmounting of file systems",
                .value =
                "chroot\0"
                "fsconfig\0"
                "fsmount\0"
                "fsopen\0"
                "fspick\0"
                "mount\0"
                "mount_setattr\0"
                "move_mount\0"
                "open_tree\0"
                "pivot_root\0"
                "umount\0"
                "umount2\0"
        },
        [SYSCALL_FILTER_SET_NETWORK_IO] = {
                .name = "@network-io",
                .help = "Network or Unix socket IO, should not be needed if not network facing",
                .value =
                "accept\0"
                "accept4\0"
                "bind\0"
                "connect\0"
                "getpeername\0"
                "getsockname\0"
                "getsockopt\0"
                "listen\0"
                "recv\0"
                "recvfrom\0"
                "recvmmsg\0"
                "recvmmsg_time64\0"
                "recvmsg\0"
                "send\0"
                "sendmmsg\0"
                "sendmsg\0"
                "sendto\0"
                "setsockopt\0"
                "shutdown\0"
                "socket\0"
                "socketcall\0"
                "socketpair\0"
        },
        [SYSCALL_FILTER_SET_OBSOLETE] = {
                /* some unknown even to libseccomp */
                .name = "@obsolete",
                .help = "Unusual, obsolete or unimplemented system calls",
                .value =
                "_sysctl\0"
                "afs_syscall\0"
                "bdflush\0"
                "break\0"
                "create_module\0"
                "ftime\0"
                "get_kernel_syms\0"
                "getpmsg\0"
                "gtty\0"
                "idle\0"
                "lock\0"
                "mpx\0"
                "prof\0"
                "profil\0"
                "putpmsg\0"
                "query_module\0"
                "security\0"
                "sgetmask\0"
                "ssetmask\0"
                "stime\0"
                "stty\0"
                "sysfs\0"
                "tuxcall\0"
                "ulimit\0"
                "uselib\0"
                "ustat\0"
                "vserver\0"
        },
        [SYSCALL_FILTER_SET_PKEY] = {
                .name = "@pkey",
                .help = "System calls used for memory protection keys",
                .value =
                "pkey_alloc\0"
                "pkey_free\0"
                "pkey_mprotect\0"
        },
        [SYSCALL_FILTER_SET_PRIVILEGED] = {
                .name = "@privileged",
                .help = "All system calls which need super-user capabilities",
                .value =
                /* The "@" entries pull in the complete contents of the named sets. */
                "@chown\0"
                "@clock\0"
                "@module\0"
                "@raw-io\0"
                "@reboot\0"
                "@swap\0"
                "_sysctl\0"
                "acct\0"
                "bpf\0"
                "capset\0"
                "chroot\0"
                "fanotify_init\0"
                "fanotify_mark\0"
                "nfsservctl\0"
                "open_by_handle_at\0"
                "pivot_root\0"
                "quotactl\0"
                "setdomainname\0"
                "setfsuid\0"
                "setfsuid32\0"
                "setgroups\0"
                "setgroups32\0"
                "sethostname\0"
                "setresuid\0"
                "setresuid32\0"
                "setreuid\0"
                "setreuid32\0"
                "setuid\0"      /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
                "setuid32\0"
                "vhangup\0"
        },
        [SYSCALL_FILTER_SET_PROCESS] = {
                .name = "@process",
                .help = "Process control, execution, namespacing operations",
                .value =
                "capget\0"      /* Able to query arbitrary processes */
                "clone\0"
                /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
                 * implement seccomp, so we don't need to list it at all. C.f.
                 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
                "clone3\0"
                "execveat\0"
                "fork\0"
                "getrusage\0"
                "kill\0"
                "pidfd_open\0"
                "pidfd_send_signal\0"
                "prctl\0"
                "rt_sigqueueinfo\0"
                "rt_tgsigqueueinfo\0"
                "setns\0"
                "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
                "tgkill\0"
                "times\0"
                "tkill\0"
                "unshare\0"
                "vfork\0"
                "wait4\0"
                "waitid\0"
                "waitpid\0"
        },
        [SYSCALL_FILTER_SET_RAW_IO] = {
                .name = "@raw-io",
                .help = "Raw I/O port access",
                .value =
                "ioperm\0"
                "iopl\0"
                "pciconfig_iobase\0"
                "pciconfig_read\0"
                "pciconfig_write\0"
                "s390_pci_mmio_read\0"
                "s390_pci_mmio_write\0"
        },
        [SYSCALL_FILTER_SET_REBOOT] = {
                .name = "@reboot",
                .help = "Reboot and reboot preparation/kexec",
                .value =
                "kexec_file_load\0"
                "kexec_load\0"
                "reboot\0"
        },
        [SYSCALL_FILTER_SET_RESOURCES] = {
                .name = "@resources",
                .help = "Alter resource settings",
                .value =
                "ioprio_set\0"
                "mbind\0"
                "migrate_pages\0"
                "move_pages\0"
                "nice\0"
                "sched_setaffinity\0"
                "sched_setattr\0"
                "sched_setparam\0"
                "sched_setscheduler\0"
                "set_mempolicy\0"
                "setpriority\0"
                "setrlimit\0"
        },
        [SYSCALL_FILTER_SET_SETUID] = {
                .name = "@setuid",
                .help = "Operations for changing user/group credentials",
                .value =
                "setgid\0"
                "setgid32\0"
                "setgroups\0"
                "setgroups32\0"
                "setregid\0"
                "setregid32\0"
                "setresgid\0"
                "setresgid32\0"
                "setresuid\0"
                "setresuid32\0"
                "setreuid\0"
                "setreuid32\0"
                "setuid\0"
                "setuid32\0"
        },
        [SYSCALL_FILTER_SET_SIGNAL] = {
                .name = "@signal",
                .help = "Process signal handling",
                .value =
                "rt_sigaction\0"
                "rt_sigpending\0"
                "rt_sigprocmask\0"
                "rt_sigsuspend\0"
                "rt_sigtimedwait\0"
                "rt_sigtimedwait_time64\0"
                "sigaction\0"
                "sigaltstack\0"
                "signal\0"
                "signalfd\0"
                "signalfd4\0"
                "sigpending\0"
                "sigprocmask\0"
                "sigsuspend\0"
        },
        [SYSCALL_FILTER_SET_SWAP] = {
                .name = "@swap",
                .help = "Enable/disable swap devices",
                .value =
                "swapoff\0"
                "swapon\0"
        },
        [SYSCALL_FILTER_SET_SYNC] = {
                .name = "@sync",
                .help = "Synchronize files and memory to storage",
                .value =
                "fdatasync\0"
                "fsync\0"
                "msync\0"
                "sync\0"
                "sync_file_range\0"
                "sync_file_range2\0"
                "syncfs\0"
        },
        [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
                .name = "@system-service",
                .help = "General system service operations",
                .value =
                /* First the sets this one is composed of, then additional individual system calls. */
                "@aio\0"
                "@basic-io\0"
                "@chown\0"
                "@default\0"
                "@file-system\0"
                "@io-event\0"
                "@ipc\0"
                "@keyring\0"
                "@memlock\0"
                "@network-io\0"
                "@process\0"
                "@resources\0"
                "@setuid\0"
                "@signal\0"
                "@sync\0"
                "@timer\0"
                "capget\0"
                "capset\0"
                "copy_file_range\0"
                "fadvise64\0"
                "fadvise64_64\0"
                "flock\0"
                "get_mempolicy\0"
                "getcpu\0"
                "getpriority\0"
                "ioctl\0"
                "ioprio_get\0"
                "kcmp\0"
                "madvise\0"
                "mremap\0"
                "name_to_handle_at\0"
                "oldolduname\0"
                "olduname\0"
                "personality\0"
                "readahead\0"
                "readdir\0"
                "remap_file_pages\0"
                "sched_get_priority_max\0"
                "sched_get_priority_min\0"
                "sched_getattr\0"
                "sched_getparam\0"
                "sched_getscheduler\0"
                "sched_rr_get_interval\0"
                "sched_rr_get_interval_time64\0"
                "sched_yield\0"
                "sendfile\0"
                "sendfile64\0"
                "setfsgid\0"
                "setfsgid32\0"
                "setfsuid\0"
                "setfsuid32\0"
                "setpgid\0"
                "setsid\0"
                "splice\0"
                "sysinfo\0"
                "tee\0"
                "umask\0"
                "uname\0"
                "userfaultfd\0"
                "vmsplice\0"
        },
        [SYSCALL_FILTER_SET_TIMER] = {
                .name = "@timer",
                .help = "Schedule operations by time",
                .value =
                "alarm\0"
                "getitimer\0"
                "setitimer\0"
                "timer_create\0"
                "timer_delete\0"
                "timer_getoverrun\0"
                "timer_gettime\0"
                "timer_gettime64\0"
                "timer_settime\0"
                "timer_settime64\0"
                "timerfd_create\0"
                "timerfd_gettime\0"
                "timerfd_gettime64\0"
                "timerfd_settime\0"
                "timerfd_settime64\0"
                "times\0"
        },
        [SYSCALL_FILTER_SET_KNOWN] = {
                .name = "@known",
                .help = "All known syscalls declared in the kernel",
                .value =
                "@obsolete\0"
                /* NOTE(review): syscall-list.h is not visible from here; presumably it expands to further
                 * "name\0" string literals that are concatenated onto the list. Confirm against the
                 * generated header. */
#include "syscall-list.h"
        },
};
954
955 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
956 if (isempty(name) || name[0] != '@')
957 return NULL;
958
959 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
960 if (streq(syscall_filter_sets[i].name, name))
961 return syscall_filter_sets + i;
962
963 return NULL;
964 }
965
/* Forward declaration: seccomp_add_syscall_filter_item() below and add_syscall_filter_set() call each
 * other, because a filter set may list other filter sets ("@...") among its members. */
static int add_syscall_filter_set(
                scmp_filter_ctx seccomp,
                const SyscallFilterSet *set,
                uint32_t action,
                char **exclude,
                bool log_missing,
                char ***added);
973
974 int seccomp_add_syscall_filter_item(
975 scmp_filter_ctx *seccomp,
976 const char *name,
977 uint32_t action,
978 char **exclude,
979 bool log_missing,
980 char ***added) {
981
982 assert(seccomp);
983 assert(name);
984
985 if (strv_contains(exclude, name))
986 return 0;
987
988 /* Any syscalls that are handled are added to the *added strv. The pointer
989 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
990
991 if (name[0] == '@') {
992 const SyscallFilterSet *other;
993
994 other = syscall_filter_set_find(name);
995 if (!other)
996 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
997 "Filter set %s is not known!",
998 name);
999
1000 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
1001
1002 } else {
1003 int id, r;
1004
1005 id = seccomp_syscall_resolve_name(name);
1006 if (id == __NR_SCMP_ERROR) {
1007 if (log_missing)
1008 log_debug("System call %s is not known, ignoring.", name);
1009 return 0;
1010 }
1011
1012 r = seccomp_rule_add_exact(seccomp, action, id, 0);
1013 if (r < 0) {
1014 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1015 bool ignore = r == -EDOM;
1016
1017 if (!ignore || log_missing)
1018 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1019 name, id, ignore ? ", ignoring" : "");
1020 if (!ignore)
1021 return r;
1022 }
1023
1024 if (added) {
1025 r = strv_extend(added, name);
1026 if (r < 0)
1027 return r;
1028 }
1029
1030 return 0;
1031 }
1032 }
1033
1034 static int add_syscall_filter_set(
1035 scmp_filter_ctx seccomp,
1036 const SyscallFilterSet *set,
1037 uint32_t action,
1038 char **exclude,
1039 bool log_missing,
1040 char ***added) {
1041
1042 const char *sys;
1043 int r;
1044
1045 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1046
1047 assert(seccomp);
1048 assert(set);
1049
1050 NULSTR_FOREACH(sys, set->value) {
1051 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1052 if (r < 0)
1053 return r;
1054 }
1055
1056 return 0;
1057 }
1058
1059 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1060 uint32_t arch;
1061 int r;
1062
1063 assert(set);
1064
1065 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1066 * each local arch. */
1067
1068 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1069 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1070
1071 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1072
1073 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1074 if (r < 0)
1075 return r;
1076
1077 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1078 if (r < 0)
1079 return log_debug_errno(r, "Failed to add filter set: %m");
1080
1081 r = seccomp_load(seccomp);
1082 if (ERRNO_IS_SECCOMP_FATAL(r))
1083 return r;
1084 if (r < 0)
1085 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1086 }
1087
1088 return 0;
1089 }
1090
1091 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1092 uint32_t arch;
1093 int r;
1094
1095 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1096 * of a SyscallFilterSet* table. */
1097
1098 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1099 return 0;
1100
1101 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1102 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1103 void *syscall_id, *val;
1104
1105 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1106
1107 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1108 if (r < 0)
1109 return r;
1110
1111 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1112 uint32_t a = action;
1113 int id = PTR_TO_INT(syscall_id) - 1;
1114 int error = PTR_TO_INT(val);
1115
1116 if (error == SECCOMP_ERROR_NUMBER_KILL)
1117 a = scmp_act_kill_process();
1118 #ifdef SCMP_ACT_LOG
1119 else if (action == SCMP_ACT_LOG)
1120 a = SCMP_ACT_LOG;
1121 #endif
1122 else if (error >= 0)
1123 a = SCMP_ACT_ERRNO(error);
1124
1125 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1126 if (r < 0) {
1127 /* If the system call is not known on this architecture, then that's
1128 * fine, let's ignore it */
1129 _cleanup_free_ char *n = NULL;
1130 bool ignore;
1131
1132 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1133 ignore = r == -EDOM;
1134 if (!ignore || log_missing)
1135 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1136 strna(n), id, ignore ? ", ignoring" : "");
1137 if (!ignore)
1138 return r;
1139 }
1140 }
1141
1142 r = seccomp_load(seccomp);
1143 if (ERRNO_IS_SECCOMP_FATAL(r))
1144 return r;
1145 if (r < 0)
1146 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1147 seccomp_arch_to_string(arch));
1148 }
1149
1150 return 0;
1151 }
1152
1153 int seccomp_parse_syscall_filter(
1154 const char *name,
1155 int errno_num,
1156 Hashmap *filter,
1157 SeccompParseFlags flags,
1158 const char *unit,
1159 const char *filename,
1160 unsigned line) {
1161
1162 int r;
1163
1164 assert(name);
1165 assert(filter);
1166
1167 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1168 return -EINVAL;
1169
1170 if (name[0] == '@') {
1171 const SyscallFilterSet *set;
1172 const char *i;
1173
1174 set = syscall_filter_set_find(name);
1175 if (!set) {
1176 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1177 return -EINVAL;
1178
1179 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1180 "Unknown system call group, ignoring: %s", name);
1181 return 0;
1182 }
1183
1184 NULSTR_FOREACH(i, set->value) {
1185 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1186 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1187 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1188 * about them. */
1189 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1190 if (r < 0)
1191 return r;
1192 }
1193 } else {
1194 int id;
1195
1196 id = seccomp_syscall_resolve_name(name);
1197 if (id == __NR_SCMP_ERROR) {
1198 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1199 return -EINVAL;
1200
1201 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1202 "Failed to parse system call, ignoring: %s", name);
1203 return 0;
1204 }
1205
1206 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1207 * it from the list. The entries in allow-list with non-negative error value will be
1208 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1209 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1210 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1211 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1212 if (r < 0)
1213 switch (r) {
1214 case -ENOMEM:
1215 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1216 case -EEXIST:
1217 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1218 break;
1219 default:
1220 return r;
1221 }
1222 } else
1223 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1224 }
1225
1226 return 0;
1227 }
1228
1229 int seccomp_restrict_namespaces(unsigned long retain) {
1230 uint32_t arch;
1231 int r;
1232
1233 if (DEBUG_LOGGING) {
1234 _cleanup_free_ char *s = NULL;
1235
1236 (void) namespace_flags_to_string(retain, &s);
1237 log_debug("Restricting namespace to: %s.", strna(s));
1238 }
1239
1240 /* NOOP? */
1241 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1242 return 0;
1243
1244 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1245 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1246
1247 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1248
1249 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1250 if (r < 0)
1251 return r;
1252
1253 /* We cannot filter on individual flags to clone3(), and we need to disable the
1254 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1255 * users shall fall back to clone(), as if on an older kernel.
1256 *
1257 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1258 * https://github.com/moby/moby/issues/42680. */
1259
1260 r = seccomp_rule_add_exact(
1261 seccomp,
1262 SCMP_ACT_ERRNO(ENOSYS),
1263 SCMP_SYS(clone3),
1264 0);
1265 if (r < 0)
1266 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1267
1268 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1269 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1270 * altogether. */
1271 r = seccomp_rule_add_exact(
1272 seccomp,
1273 SCMP_ACT_ERRNO(EPERM),
1274 SCMP_SYS(setns),
1275 0);
1276 else
1277 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1278 * special invocation with a zero flags argument, right here. */
1279 r = seccomp_rule_add_exact(
1280 seccomp,
1281 SCMP_ACT_ERRNO(EPERM),
1282 SCMP_SYS(setns),
1283 1,
1284 SCMP_A1(SCMP_CMP_EQ, 0));
1285 if (r < 0) {
1286 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1287 continue;
1288 }
1289
1290 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
1291 unsigned long f;
1292
1293 f = namespace_info[i].clone_flag;
1294 if (FLAGS_SET(retain, f)) {
1295 log_debug("Permitting %s.", namespace_info[i].proc_name);
1296 continue;
1297 }
1298
1299 log_debug("Blocking %s.", namespace_info[i].proc_name);
1300
1301 r = seccomp_rule_add_exact(
1302 seccomp,
1303 SCMP_ACT_ERRNO(EPERM),
1304 SCMP_SYS(unshare),
1305 1,
1306 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1307 if (r < 0) {
1308 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1309 break;
1310 }
1311
1312 /* On s390/s390x the first two parameters to clone are switched */
1313 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1314 r = seccomp_rule_add_exact(
1315 seccomp,
1316 SCMP_ACT_ERRNO(EPERM),
1317 SCMP_SYS(clone),
1318 1,
1319 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1320 else
1321 r = seccomp_rule_add_exact(
1322 seccomp,
1323 SCMP_ACT_ERRNO(EPERM),
1324 SCMP_SYS(clone),
1325 1,
1326 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1327 if (r < 0) {
1328 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1329 break;
1330 }
1331
1332 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1333 r = seccomp_rule_add_exact(
1334 seccomp,
1335 SCMP_ACT_ERRNO(EPERM),
1336 SCMP_SYS(setns),
1337 1,
1338 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1339 if (r < 0) {
1340 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1341 break;
1342 }
1343 }
1344 }
1345 if (r < 0)
1346 continue;
1347
1348 r = seccomp_load(seccomp);
1349 if (ERRNO_IS_SECCOMP_FATAL(r))
1350 return r;
1351 if (r < 0)
1352 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1353 }
1354
1355 return 0;
1356 }
1357
1358 int seccomp_protect_sysctl(void) {
1359 uint32_t arch;
1360 int r;
1361
1362 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1363 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1364
1365 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1366
1367 if (IN_SET(arch,
1368 SCMP_ARCH_AARCH64,
1369 #ifdef SCMP_ARCH_RISCV64
1370 SCMP_ARCH_RISCV64,
1371 #endif
1372 SCMP_ARCH_X32
1373 ))
1374 /* No _sysctl syscall */
1375 continue;
1376
1377 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1378 if (r < 0)
1379 return r;
1380
1381 r = seccomp_rule_add_exact(
1382 seccomp,
1383 SCMP_ACT_ERRNO(EPERM),
1384 SCMP_SYS(_sysctl),
1385 0);
1386 if (r < 0) {
1387 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1388 continue;
1389 }
1390
1391 r = seccomp_load(seccomp);
1392 if (ERRNO_IS_SECCOMP_FATAL(r))
1393 return r;
1394 if (r < 0)
1395 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1396 }
1397
1398 return 0;
1399 }
1400
1401 int seccomp_protect_syslog(void) {
1402 uint32_t arch;
1403 int r;
1404
1405 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1406 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1407
1408 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1409 if (r < 0)
1410 return r;
1411
1412 r = seccomp_rule_add_exact(
1413 seccomp,
1414 SCMP_ACT_ERRNO(EPERM),
1415 SCMP_SYS(syslog),
1416 0);
1417
1418 if (r < 0) {
1419 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1420 continue;
1421 }
1422
1423 r = seccomp_load(seccomp);
1424 if (ERRNO_IS_SECCOMP_FATAL(r))
1425 return r;
1426 if (r < 0)
1427 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1428 }
1429
1430 return 0;
1431 }
1432
1433 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1434 uint32_t arch;
1435 int r;
1436
1437 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1438 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1439 bool supported;
1440
1441 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1442
1443 switch (arch) {
1444
1445 case SCMP_ARCH_X86_64:
1446 case SCMP_ARCH_X32:
1447 case SCMP_ARCH_ARM:
1448 case SCMP_ARCH_AARCH64:
1449 case SCMP_ARCH_MIPSEL64N32:
1450 case SCMP_ARCH_MIPS64N32:
1451 case SCMP_ARCH_MIPSEL64:
1452 case SCMP_ARCH_MIPS64:
1453 #ifdef SCMP_ARCH_RISCV64
1454 case SCMP_ARCH_RISCV64:
1455 #endif
1456 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1457 supported = true;
1458 break;
1459
1460 case SCMP_ARCH_S390:
1461 case SCMP_ARCH_S390X:
1462 case SCMP_ARCH_X86:
1463 case SCMP_ARCH_MIPSEL:
1464 case SCMP_ARCH_MIPS:
1465 #ifdef SCMP_ARCH_PARISC
1466 case SCMP_ARCH_PARISC:
1467 #endif
1468 #ifdef SCMP_ARCH_PARISC64
1469 case SCMP_ARCH_PARISC64:
1470 #endif
1471 case SCMP_ARCH_PPC:
1472 case SCMP_ARCH_PPC64:
1473 case SCMP_ARCH_PPC64LE:
1474 default:
1475 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1476 * don't know */
1477 supported = false;
1478 break;
1479 }
1480
1481 if (!supported)
1482 continue;
1483
1484 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1485 if (r < 0)
1486 return r;
1487
1488 if (allow_list) {
1489 int first = 0, last = 0;
1490 void *afp;
1491
1492 /* If this is an allow list, we first block the address families that are out of
1493 * range and then everything that is not in the set. First, we find the lowest and
1494 * highest address family in the set. */
1495
1496 SET_FOREACH(afp, address_families) {
1497 int af = PTR_TO_INT(afp);
1498
1499 if (af <= 0 || af >= af_max())
1500 continue;
1501
1502 if (first == 0 || af < first)
1503 first = af;
1504
1505 if (last == 0 || af > last)
1506 last = af;
1507 }
1508
1509 assert((first == 0) == (last == 0));
1510
1511 if (first == 0) {
1512
1513 /* No entries in the valid range, block everything */
1514 r = seccomp_rule_add_exact(
1515 seccomp,
1516 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1517 SCMP_SYS(socket),
1518 0);
1519 if (r < 0) {
1520 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1521 continue;
1522 }
1523
1524 } else {
1525
1526 /* Block everything below the first entry */
1527 r = seccomp_rule_add_exact(
1528 seccomp,
1529 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1530 SCMP_SYS(socket),
1531 1,
1532 SCMP_A0(SCMP_CMP_LT, first));
1533 if (r < 0) {
1534 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1535 continue;
1536 }
1537
1538 /* Block everything above the last entry */
1539 r = seccomp_rule_add_exact(
1540 seccomp,
1541 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1542 SCMP_SYS(socket),
1543 1,
1544 SCMP_A0(SCMP_CMP_GT, last));
1545 if (r < 0) {
1546 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1547 continue;
1548 }
1549
1550 /* Block everything between the first and last entry */
1551 for (int af = 1; af < af_max(); af++) {
1552
1553 if (set_contains(address_families, INT_TO_PTR(af)))
1554 continue;
1555
1556 r = seccomp_rule_add_exact(
1557 seccomp,
1558 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1559 SCMP_SYS(socket),
1560 1,
1561 SCMP_A0(SCMP_CMP_EQ, af));
1562 if (r < 0)
1563 break;
1564 }
1565 if (r < 0) {
1566 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1567 continue;
1568 }
1569 }
1570
1571 } else {
1572 void *af;
1573
1574 /* If this is a deny list, then generate one rule for each address family that are
1575 * then combined in OR checks. */
1576
1577 SET_FOREACH(af, address_families) {
1578 r = seccomp_rule_add_exact(
1579 seccomp,
1580 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1581 SCMP_SYS(socket),
1582 1,
1583 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1584 if (r < 0)
1585 break;
1586 }
1587 if (r < 0) {
1588 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1589 continue;
1590 }
1591 }
1592
1593 r = seccomp_load(seccomp);
1594 if (ERRNO_IS_SECCOMP_FATAL(r))
1595 return r;
1596 if (r < 0)
1597 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1598 }
1599
1600 return 0;
1601 }
1602
1603 int seccomp_restrict_realtime_full(int error_code) {
1604 static const int permitted_policies[] = {
1605 SCHED_OTHER,
1606 SCHED_BATCH,
1607 SCHED_IDLE,
1608 };
1609
1610 int r, max_policy = 0;
1611 uint32_t arch;
1612 unsigned i;
1613
1614 assert(error_code > 0);
1615
1616 /* Determine the highest policy constant we want to allow */
1617 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1618 if (permitted_policies[i] > max_policy)
1619 max_policy = permitted_policies[i];
1620
1621 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1622 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1623 int p;
1624
1625 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1626
1627 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1628 if (r < 0)
1629 return r;
1630
1631 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1632 * allow list. */
1633 for (p = 0; p < max_policy; p++) {
1634 bool good = false;
1635
1636 /* Check if this is in the allow list. */
1637 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1638 if (permitted_policies[i] == p) {
1639 good = true;
1640 break;
1641 }
1642
1643 if (good)
1644 continue;
1645
1646 /* Deny this policy */
1647 r = seccomp_rule_add_exact(
1648 seccomp,
1649 SCMP_ACT_ERRNO(error_code),
1650 SCMP_SYS(sched_setscheduler),
1651 1,
1652 SCMP_A1(SCMP_CMP_EQ, p));
1653 if (r < 0) {
1654 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1655 continue;
1656 }
1657 }
1658
1659 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1660 * are unsigned here, hence no need no check for < 0 values. */
1661 r = seccomp_rule_add_exact(
1662 seccomp,
1663 SCMP_ACT_ERRNO(error_code),
1664 SCMP_SYS(sched_setscheduler),
1665 1,
1666 SCMP_A1(SCMP_CMP_GT, max_policy));
1667 if (r < 0) {
1668 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1669 continue;
1670 }
1671
1672 r = seccomp_load(seccomp);
1673 if (ERRNO_IS_SECCOMP_FATAL(r))
1674 return r;
1675 if (r < 0)
1676 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1677 }
1678
1679 return 0;
1680 }
1681
1682 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1683 uint32_t arch,
1684 int nr,
1685 unsigned arg_cnt,
1686 const struct scmp_arg_cmp arg) {
1687 int r;
1688
1689 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1690 if (r < 0) {
1691 _cleanup_free_ char *n = NULL;
1692
1693 n = seccomp_syscall_resolve_num_arch(arch, nr);
1694 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1695 strna(n),
1696 seccomp_arch_to_string(arch));
1697 }
1698
1699 return r;
1700 }
1701
1702 /* For known architectures, check that syscalls are indeed defined or not. */
1703 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1704 assert_cc(SCMP_SYS(shmget) > 0);
1705 assert_cc(SCMP_SYS(shmat) > 0);
1706 assert_cc(SCMP_SYS(shmdt) > 0);
1707 #endif
1708
1709 int seccomp_memory_deny_write_execute(void) {
1710 uint32_t arch;
1711 unsigned loaded = 0;
1712
1713 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1714 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1715 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1716
1717 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1718
1719 switch (arch) {
1720
1721 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1722 * We ignore that here, which means there's still a way to get writable/executable
1723 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1724 *
1725 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1726 * on that front (kernel work done in 5.18).
1727 */
1728
1729 case SCMP_ARCH_X86:
1730 case SCMP_ARCH_S390:
1731 filter_syscall = SCMP_SYS(mmap2);
1732 block_syscall = SCMP_SYS(mmap);
1733 /* shmat multiplexed, see above */
1734 break;
1735
1736 case SCMP_ARCH_PPC:
1737 case SCMP_ARCH_PPC64:
1738 case SCMP_ARCH_PPC64LE:
1739 case SCMP_ARCH_S390X:
1740 filter_syscall = SCMP_SYS(mmap);
1741 /* shmat multiplexed, see above */
1742 break;
1743
1744 case SCMP_ARCH_ARM:
1745 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1746 shmat_syscall = SCMP_SYS(shmat);
1747 break;
1748
1749 case SCMP_ARCH_X86_64:
1750 case SCMP_ARCH_X32:
1751 case SCMP_ARCH_AARCH64:
1752 #ifdef SCMP_ARCH_RISCV64
1753 case SCMP_ARCH_RISCV64:
1754 #endif
1755 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1756 shmat_syscall = SCMP_SYS(shmat);
1757 break;
1758
1759 /* Please add more definitions here, if you port systemd to other architectures! */
1760
1761 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1762 #warning "Consider adding the right mmap() syscall definitions here!"
1763 #endif
1764 }
1765
1766 /* Can't filter mmap() on this arch, then skip it */
1767 if (filter_syscall == 0)
1768 continue;
1769
1770 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1771 if (r < 0)
1772 return r;
1773
1774 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1775 1,
1776 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1777 if (r < 0)
1778 continue;
1779
1780 if (block_syscall != 0) {
1781 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1782 if (r < 0)
1783 continue;
1784 }
1785
1786 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1787 1,
1788 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1789 if (r < 0)
1790 continue;
1791
1792 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1793 1,
1794 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1795 if (r < 0)
1796 continue;
1797
1798 if (shmat_syscall > 0) {
1799 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1800 1,
1801 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1802 if (r < 0)
1803 continue;
1804 }
1805
1806 r = seccomp_load(seccomp);
1807 if (ERRNO_IS_SECCOMP_FATAL(r))
1808 return r;
1809 if (r < 0)
1810 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1811 seccomp_arch_to_string(arch));
1812 loaded++;
1813 }
1814
1815 if (loaded == 0)
1816 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1817
1818 return loaded;
1819 }
1820
1821 int seccomp_restrict_archs(Set *archs) {
1822 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1823 int r;
1824 bool blocked_new = false;
1825
1826 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1827 * list.
1828 *
1829 * There are some qualifications. However the most important use is to stop processes from bypassing
1830 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1831 * in a non-native architecture. There are no holes in this use case, at least so far. */
1832
1833 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1834 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1835 * to run a program with the restrictions applied. */
1836 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1837 if (!seccomp)
1838 return -ENOMEM;
1839
1840 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1841 uint32_t arch = seccomp_local_archs[i];
1842
1843 /* See above comment, our "native" architecture is never blocked. */
1844 if (arch == seccomp_arch_native())
1845 continue;
1846
1847 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1848 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1849 continue;
1850
1851 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1852
1853 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1854 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1855 * The important thing is that you can block the old 32-bit x86 syscalls.
1856 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1857 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1858 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1859
1860 if (block) {
1861 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1862 blocked_new = true;
1863 } else {
1864 r = seccomp_arch_add(seccomp, arch);
1865 if (r < 0 && r != -EEXIST)
1866 return r;
1867 }
1868 }
1869
1870 /* All architectures that will be blocked by the seccomp program were
1871 * already blocked. */
1872 if (!blocked_new)
1873 return 0;
1874
1875 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1876 if (r < 0)
1877 return r;
1878
1879 r = seccomp_load(seccomp);
1880 if (ERRNO_IS_SECCOMP_FATAL(r))
1881 return r;
1882 if (r < 0)
1883 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1884
1885 return 0;
1886 }
1887
1888 int parse_syscall_archs(char **l, Set **ret_archs) {
1889 _cleanup_set_free_ Set *archs = NULL;
1890 int r;
1891
1892 assert(l);
1893 assert(ret_archs);
1894
1895 STRV_FOREACH(s, l) {
1896 uint32_t a;
1897
1898 r = seccomp_arch_from_string(*s, &a);
1899 if (r < 0)
1900 return -EINVAL;
1901
1902 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1903 if (r < 0)
1904 return -ENOMEM;
1905 }
1906
1907 *ret_archs = TAKE_PTR(archs);
1908 return 0;
1909 }
1910
1911 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1912 const char *i;
1913 int r;
1914
1915 assert(set);
1916
1917 NULSTR_FOREACH(i, set->value) {
1918
1919 if (i[0] == '@') {
1920 const SyscallFilterSet *more;
1921
1922 more = syscall_filter_set_find(i);
1923 if (!more)
1924 return -ENXIO;
1925
1926 r = seccomp_filter_set_add(filter, add, more);
1927 if (r < 0)
1928 return r;
1929 } else {
1930 int id;
1931
1932 id = seccomp_syscall_resolve_name(i);
1933 if (id == __NR_SCMP_ERROR) {
1934 log_debug("Couldn't resolve system call, ignoring: %s", i);
1935 continue;
1936 }
1937
1938 if (add) {
1939 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1940 if (r < 0)
1941 return r;
1942 } else
1943 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1944 }
1945 }
1946
1947 return 0;
1948 }
1949
1950 int seccomp_lock_personality(unsigned long personality) {
1951 uint32_t arch;
1952 int r;
1953
1954 if (personality >= PERSONALITY_INVALID)
1955 return -EINVAL;
1956
1957 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1958 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1959
1960 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1961 if (r < 0)
1962 return r;
1963
1964 r = seccomp_rule_add_exact(
1965 seccomp,
1966 SCMP_ACT_ERRNO(EPERM),
1967 SCMP_SYS(personality),
1968 1,
1969 SCMP_A0(SCMP_CMP_NE, personality));
1970 if (r < 0) {
1971 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1972 continue;
1973 }
1974
1975 r = seccomp_load(seccomp);
1976 if (ERRNO_IS_SECCOMP_FATAL(r))
1977 return r;
1978 if (r < 0)
1979 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1980 }
1981
1982 return 0;
1983 }
1984
1985 int seccomp_protect_hostname(void) {
1986 uint32_t arch;
1987 int r;
1988
1989 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1990 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1991
1992 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1993 if (r < 0)
1994 return r;
1995
1996 r = seccomp_rule_add_exact(
1997 seccomp,
1998 SCMP_ACT_ERRNO(EPERM),
1999 SCMP_SYS(sethostname),
2000 0);
2001 if (r < 0) {
2002 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2003 continue;
2004 }
2005
2006 r = seccomp_rule_add_exact(
2007 seccomp,
2008 SCMP_ACT_ERRNO(EPERM),
2009 SCMP_SYS(setdomainname),
2010 0);
2011 if (r < 0) {
2012 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2013 continue;
2014 }
2015
2016 r = seccomp_load(seccomp);
2017 if (ERRNO_IS_SECCOMP_FATAL(r))
2018 return r;
2019 if (r < 0)
2020 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2021 }
2022
2023 return 0;
2024 }
2025
2026 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2027 /* Checks the mode_t parameter of the following system calls:
2028 *
2029 * → chmod() + fchmod() + fchmodat()
2030 * → open() + creat() + openat()
2031 * → mkdir() + mkdirat()
2032 * → mknod() + mknodat()
2033 *
2034 * Returns error if *everything* failed, and 0 otherwise.
2035 */
2036 int r;
2037 bool any = false;
2038
2039 r = seccomp_rule_add_exact(
2040 seccomp,
2041 SCMP_ACT_ERRNO(EPERM),
2042 SCMP_SYS(chmod),
2043 1,
2044 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2045 if (r < 0)
2046 log_debug_errno(r, "Failed to add filter for chmod: %m");
2047 else
2048 any = true;
2049
2050 r = seccomp_rule_add_exact(
2051 seccomp,
2052 SCMP_ACT_ERRNO(EPERM),
2053 SCMP_SYS(fchmod),
2054 1,
2055 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2056 if (r < 0)
2057 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2058 else
2059 any = true;
2060
2061 r = seccomp_rule_add_exact(
2062 seccomp,
2063 SCMP_ACT_ERRNO(EPERM),
2064 SCMP_SYS(fchmodat),
2065 1,
2066 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2067 if (r < 0)
2068 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2069 else
2070 any = true;
2071
2072 r = seccomp_rule_add_exact(
2073 seccomp,
2074 SCMP_ACT_ERRNO(EPERM),
2075 SCMP_SYS(mkdir),
2076 1,
2077 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2078 if (r < 0)
2079 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2080 else
2081 any = true;
2082
2083 r = seccomp_rule_add_exact(
2084 seccomp,
2085 SCMP_ACT_ERRNO(EPERM),
2086 SCMP_SYS(mkdirat),
2087 1,
2088 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2089 if (r < 0)
2090 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2091 else
2092 any = true;
2093
2094 r = seccomp_rule_add_exact(
2095 seccomp,
2096 SCMP_ACT_ERRNO(EPERM),
2097 SCMP_SYS(mknod),
2098 1,
2099 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2100 if (r < 0)
2101 log_debug_errno(r, "Failed to add filter for mknod: %m");
2102 else
2103 any = true;
2104
2105 r = seccomp_rule_add_exact(
2106 seccomp,
2107 SCMP_ACT_ERRNO(EPERM),
2108 SCMP_SYS(mknodat),
2109 1,
2110 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2111 if (r < 0)
2112 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2113 else
2114 any = true;
2115
2116 r = seccomp_rule_add_exact(
2117 seccomp,
2118 SCMP_ACT_ERRNO(EPERM),
2119 SCMP_SYS(open),
2120 2,
2121 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2122 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2123 if (r < 0)
2124 log_debug_errno(r, "Failed to add filter for open: %m");
2125 else
2126 any = true;
2127
2128 r = seccomp_rule_add_exact(
2129 seccomp,
2130 SCMP_ACT_ERRNO(EPERM),
2131 SCMP_SYS(openat),
2132 2,
2133 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2134 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2135 if (r < 0)
2136 log_debug_errno(r, "Failed to add filter for openat: %m");
2137 else
2138 any = true;
2139
2140 #if defined(__SNR_openat2)
2141 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2142 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2143 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2144 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2145 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2146 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2147 r = seccomp_rule_add_exact(
2148 seccomp,
2149 SCMP_ACT_ERRNO(ENOSYS),
2150 SCMP_SYS(openat2),
2151 0);
2152 if (r < 0)
2153 log_debug_errno(r, "Failed to add filter for openat2: %m");
2154 else
2155 any = true;
2156 #endif
2157
2158 r = seccomp_rule_add_exact(
2159 seccomp,
2160 SCMP_ACT_ERRNO(EPERM),
2161 SCMP_SYS(creat),
2162 1,
2163 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2164 if (r < 0)
2165 log_debug_errno(r, "Failed to add filter for creat: %m");
2166 else
2167 any = true;
2168
2169 return any ? 0 : r;
2170 }
2171
2172 int seccomp_restrict_suid_sgid(void) {
2173 uint32_t arch;
2174 int r, k;
2175
2176 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2177 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2178
2179 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2180 if (r < 0)
2181 return r;
2182
2183 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2184 if (r < 0)
2185 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2186
2187 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2188 if (k < 0)
2189 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2190
2191 if (r < 0 && k < 0)
2192 continue;
2193
2194 r = seccomp_load(seccomp);
2195 if (ERRNO_IS_SECCOMP_FATAL(r))
2196 return r;
2197 if (r < 0)
2198 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2199 }
2200
2201 return 0;
2202 }
2203
2204 uint32_t scmp_act_kill_process(void) {
2205
2206 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2207 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2208 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2209 * for single-threaded apps does the right thing. */
2210
2211 #ifdef SCMP_ACT_KILL_PROCESS
2212 if (seccomp_api_get() >= 3)
2213 return SCMP_ACT_KILL_PROCESS;
2214 #endif
2215
2216 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2217 }
2218
2219 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2220 _cleanup_free_ char *n = NULL;
2221 char *p;
2222 int e = -1;
2223
2224 assert(in);
2225 assert(name);
2226 assert(error);
2227
2228 /*
2229 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2230 * If errno is omitted, then error is set to -1.
2231 * Empty syscall name is not allowed.
2232 * Here, we do not check that the syscall name is valid or not.
2233 */
2234
2235 p = strchr(in, ':');
2236 if (p) {
2237 e = seccomp_parse_errno_or_action(p + 1);
2238 if (e < 0)
2239 return e;
2240
2241 n = strndup(in, p - in);
2242 } else
2243 n = strdup(in);
2244
2245 if (!n)
2246 return -ENOMEM;
2247
2248 if (isempty(n))
2249 return -EINVAL;
2250
2251 *error = e;
2252 *name = TAKE_PTR(n);
2253
2254 return 0;
2255 }
2256
2257 static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2258 bool any = false;
2259 int r;
2260
2261 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2262 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2263
2264 r = seccomp_rule_add_exact(
2265 seccomp,
2266 SCMP_ACT_ERRNO(EINVAL),
2267 SCMP_SYS(open),
2268 1,
2269 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2270 if (r < 0)
2271 log_debug_errno(r, "Failed to add filter for open: %m");
2272 else
2273 any = true;
2274
2275 r = seccomp_rule_add_exact(
2276 seccomp,
2277 SCMP_ACT_ERRNO(EINVAL),
2278 SCMP_SYS(openat),
2279 1,
2280 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2281 if (r < 0)
2282 log_debug_errno(r, "Failed to add filter for openat: %m");
2283 else
2284 any = true;
2285
2286 #if defined(__SNR_openat2)
2287 /* The new openat2() system call can't be filtered sensibly, see above. */
2288 r = seccomp_rule_add_exact(
2289 seccomp,
2290 SCMP_ACT_ERRNO(ENOSYS),
2291 SCMP_SYS(openat2),
2292 0);
2293 if (r < 0)
2294 log_debug_errno(r, "Failed to add filter for openat2: %m");
2295 else
2296 any = true;
2297 #endif
2298
2299 return any ? 0 : r;
2300 }
2301
2302 int seccomp_suppress_sync(void) {
2303 uint32_t arch;
2304 int r;
2305
2306 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2307 * manageable, and also masks O_SYNC/O_DSYNC */
2308
2309 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2310 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2311 const char *c;
2312
2313 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2314 if (r < 0)
2315 return r;
2316
2317 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2318 int id;
2319
2320 id = seccomp_syscall_resolve_name(c);
2321 if (id == __NR_SCMP_ERROR) {
2322 log_debug("System call %s is not known, ignoring.", c);
2323 continue;
2324 }
2325
2326 r = seccomp_rule_add_exact(
2327 seccomp,
2328 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2329 id,
2330 0);
2331 if (r < 0)
2332 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2333 }
2334
2335 (void) block_open_flag(seccomp, O_SYNC);
2336 #if O_DSYNC != O_SYNC
2337 (void) block_open_flag(seccomp, O_DSYNC);
2338 #endif
2339
2340 r = seccomp_load(seccomp);
2341 if (ERRNO_IS_SECCOMP_FATAL(r))
2342 return r;
2343 if (r < 0)
2344 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2345 }
2346
2347 return 0;
2348 }