]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #28741 from keszybz/minor-tweaks-for-recent-patches
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10 #include <sys/stat.h>
11
12 /* Include missing_syscall_def.h before <seccomp.h>, so that __SNR_foo gets mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
14 #include <seccomp.h>
15
16 #include "af-list.h"
17 #include "alloc-util.h"
18 #include "env-util.h"
19 #include "errno-list.h"
20 #include "macro.h"
21 #include "namespace-util.h"
22 #include "nsflags.h"
23 #include "nulstr-util.h"
24 #include "process-util.h"
25 #include "seccomp-util.h"
26 #include "set.h"
27 #include "string-util.h"
28 #include "strv.h"
29
30 /* This array will be modified at runtime as seccomp_restrict_archs is called. It holds the seccomp architectures for the target we are compiled for (picked by the preprocessor conditions below), followed by the SECCOMP_LOCAL_ARCH_END marker. If no condition matches, only the marker remains. NOTE(review): presumably this is the list SECCOMP_FOREACH_LOCAL_ARCH() iterates over — confirm in seccomp-util.h. */
31 uint32_t seccomp_local_archs[] = {
32
33 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
34
35 #if defined(__x86_64__) && defined(__ILP32__)
36 SCMP_ARCH_X86,
37 SCMP_ARCH_X86_64,
38 SCMP_ARCH_X32, /* native */
39 #elif defined(__x86_64__) && !defined(__ILP32__)
40 SCMP_ARCH_X86,
41 SCMP_ARCH_X32,
42 SCMP_ARCH_X86_64, /* native */
43 #elif defined(__i386__)
44 SCMP_ARCH_X86,
45 #elif defined(__aarch64__)
46 SCMP_ARCH_ARM,
47 SCMP_ARCH_AARCH64, /* native */
48 #elif defined(__arm__)
49 SCMP_ARCH_ARM,
50 #elif defined(__loongarch_lp64)
51 SCMP_ARCH_LOONGARCH64,
52 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
53 SCMP_ARCH_MIPSEL,
54 SCMP_ARCH_MIPS, /* native */
55 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
56 SCMP_ARCH_MIPS,
57 SCMP_ARCH_MIPSEL, /* native */
58 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64N32,
63 SCMP_ARCH_MIPSEL64,
64 SCMP_ARCH_MIPS64, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPS,
67 SCMP_ARCH_MIPSEL,
68 SCMP_ARCH_MIPS64N32,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64,
71 SCMP_ARCH_MIPSEL64, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64,
77 SCMP_ARCH_MIPSEL64N32,
78 SCMP_ARCH_MIPS64N32, /* native */
79 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPS,
81 SCMP_ARCH_MIPSEL,
82 SCMP_ARCH_MIPS64,
83 SCMP_ARCH_MIPSEL64,
84 SCMP_ARCH_MIPS64N32,
85 SCMP_ARCH_MIPSEL64N32, /* native */
86 #elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
87 SCMP_ARCH_PARISC,
88 SCMP_ARCH_PARISC64, /* native */
89 #elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
90 SCMP_ARCH_PARISC,
91 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
92 SCMP_ARCH_PPC,
93 SCMP_ARCH_PPC64LE,
94 SCMP_ARCH_PPC64, /* native */
95 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
96 SCMP_ARCH_PPC,
97 SCMP_ARCH_PPC64,
98 SCMP_ARCH_PPC64LE, /* native */
99 #elif defined(__powerpc__)
100 SCMP_ARCH_PPC,
101 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
102 SCMP_ARCH_RISCV64,
103 #elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106 #elif defined(__s390__)
107 SCMP_ARCH_S390,
108 #endif
109 SECCOMP_LOCAL_ARCH_END /* end-of-list marker; listed unconditionally, outside of the #if chain above */
110 };
111
112 const char* seccomp_arch_to_string(uint32_t c) {
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
117
118 switch (c) {
119 case SCMP_ARCH_NATIVE:
120 return "native";
121 case SCMP_ARCH_X86:
122 return "x86";
123 case SCMP_ARCH_X86_64:
124 return "x86-64";
125 case SCMP_ARCH_X32:
126 return "x32";
127 case SCMP_ARCH_ARM:
128 return "arm";
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 #ifdef SCMP_ARCH_LOONGARCH64
132 case SCMP_ARCH_LOONGARCH64:
133 return "loongarch64";
134 #endif
135 case SCMP_ARCH_MIPS:
136 return "mips";
137 case SCMP_ARCH_MIPS64:
138 return "mips64";
139 case SCMP_ARCH_MIPS64N32:
140 return "mips64-n32";
141 case SCMP_ARCH_MIPSEL:
142 return "mips-le";
143 case SCMP_ARCH_MIPSEL64:
144 return "mips64-le";
145 case SCMP_ARCH_MIPSEL64N32:
146 return "mips64-le-n32";
147 #ifdef SCMP_ARCH_PARISC
148 case SCMP_ARCH_PARISC:
149 return "parisc";
150 #endif
151 #ifdef SCMP_ARCH_PARISC64
152 case SCMP_ARCH_PARISC64:
153 return "parisc64";
154 #endif
155 case SCMP_ARCH_PPC:
156 return "ppc";
157 case SCMP_ARCH_PPC64:
158 return "ppc64";
159 case SCMP_ARCH_PPC64LE:
160 return "ppc64-le";
161 #ifdef SCMP_ARCH_RISCV64
162 case SCMP_ARCH_RISCV64:
163 return "riscv64";
164 #endif
165 case SCMP_ARCH_S390:
166 return "s390";
167 case SCMP_ARCH_S390X:
168 return "s390x";
169 default:
170 return NULL;
171 }
172 }
173
174 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
175 if (!n)
176 return -EINVAL;
177
178 assert(ret);
179
180 if (streq(n, "native"))
181 *ret = SCMP_ARCH_NATIVE;
182 else if (streq(n, "x86"))
183 *ret = SCMP_ARCH_X86;
184 else if (streq(n, "x86-64"))
185 *ret = SCMP_ARCH_X86_64;
186 else if (streq(n, "x32"))
187 *ret = SCMP_ARCH_X32;
188 else if (streq(n, "arm"))
189 *ret = SCMP_ARCH_ARM;
190 else if (streq(n, "arm64"))
191 *ret = SCMP_ARCH_AARCH64;
192 #ifdef SCMP_ARCH_LOONGARCH64
193 else if (streq(n, "loongarch64"))
194 *ret = SCMP_ARCH_LOONGARCH64;
195 #endif
196 else if (streq(n, "mips"))
197 *ret = SCMP_ARCH_MIPS;
198 else if (streq(n, "mips64"))
199 *ret = SCMP_ARCH_MIPS64;
200 else if (streq(n, "mips64-n32"))
201 *ret = SCMP_ARCH_MIPS64N32;
202 else if (streq(n, "mips-le"))
203 *ret = SCMP_ARCH_MIPSEL;
204 else if (streq(n, "mips64-le"))
205 *ret = SCMP_ARCH_MIPSEL64;
206 else if (streq(n, "mips64-le-n32"))
207 *ret = SCMP_ARCH_MIPSEL64N32;
208 #ifdef SCMP_ARCH_PARISC
209 else if (streq(n, "parisc"))
210 *ret = SCMP_ARCH_PARISC;
211 #endif
212 #ifdef SCMP_ARCH_PARISC64
213 else if (streq(n, "parisc64"))
214 *ret = SCMP_ARCH_PARISC64;
215 #endif
216 else if (streq(n, "ppc"))
217 *ret = SCMP_ARCH_PPC;
218 else if (streq(n, "ppc64"))
219 *ret = SCMP_ARCH_PPC64;
220 else if (streq(n, "ppc64-le"))
221 *ret = SCMP_ARCH_PPC64LE;
222 #ifdef SCMP_ARCH_RISCV64
223 else if (streq(n, "riscv64"))
224 *ret = SCMP_ARCH_RISCV64;
225 #endif
226 else if (streq(n, "s390"))
227 *ret = SCMP_ARCH_S390;
228 else if (streq(n, "s390x"))
229 *ret = SCMP_ARCH_S390X;
230 else
231 return -EINVAL;
232
233 return 0;
234 }
235
236 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
237 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
238 int r;
239
240 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
241 * any others. Also, turns off the NNP fiddling. */
242
243 seccomp = seccomp_init(default_action);
244 if (!seccomp)
245 return -ENOMEM;
246
247 if (arch != SCMP_ARCH_NATIVE &&
248 arch != seccomp_arch_native()) {
249
250 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
251 if (r < 0)
252 return r;
253
254 r = seccomp_arch_add(seccomp, arch);
255 if (r < 0)
256 return r;
257
258 assert(seccomp_arch_exist(seccomp, arch) >= 0);
259 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
260 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
261 } else {
262 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
263 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
264 }
265
266 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
267 if (r < 0)
268 return r;
269
270 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
271 if (r < 0)
272 return r;
273
274 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
275 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
276 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
277 if (r < 0)
278 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
279 }
280 #endif
281
282 *ret = TAKE_PTR(seccomp);
283 return 0;
284 }
285
static bool is_basic_seccomp_available(void) {
        /* Reports true if querying the current seccomp mode via prctl() succeeds. */
        int mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
289
static bool is_seccomp_filter_available(void) {
        /* Probes filter mode by asking for it with a NULL filter program: the call is only taken as
         * proof of support if it fails, and fails specifically with EFAULT. */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
294
bool is_seccomp_available(void) {
        /* Tells whether seccomp filtering can be used. The answer is computed on the first call and
         * served from a static cache afterwards. Setting $SYSTEMD_SECCOMP to a false value turns
         * seccomp off regardless of what the kernel probes say. */

        static int cached_enabled = -1;
        int env;

        if (cached_enabled >= 0)
                return cached_enabled;

        env = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (env == 0) {
                /* Explicitly disabled through the environment. */
                cached_enabled = false;
                return cached_enabled;
        }

        if (env < 0 && env != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(env, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
315
/* Table of the named system call groups ("@default", "@aio", ...), indexed by the SYSCALL_FILTER_SET_*
 * constants. Each entry carries the group name, a one-line help text, and its members in .value, encoded
 * as a sequence of NUL-terminated strings. A member that starts with '@' names another group of this
 * table rather than a single system call. */
316 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
317 [SYSCALL_FILTER_SET_DEFAULT] = {
318 .name = "@default",
319 .help = "System calls that are always permitted",
320 .value =
321 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
322 "brk\0"
323 "cacheflush\0"
324 "clock_getres\0"
325 "clock_getres_time64\0"
326 "clock_gettime\0"
327 "clock_gettime64\0"
328 "clock_nanosleep\0"
329 "clock_nanosleep_time64\0"
330 "execve\0"
331 "exit\0"
332 "exit_group\0"
333 "futex\0"
334 "futex_time64\0"
335 "futex_waitv\0"
336 "get_robust_list\0"
337 "get_thread_area\0"
338 "getegid\0"
339 "getegid32\0"
340 "geteuid\0"
341 "geteuid32\0"
342 "getgid\0"
343 "getgid32\0"
344 "getgroups\0"
345 "getgroups32\0"
346 "getpgid\0"
347 "getpgrp\0"
348 "getpid\0"
349 "getppid\0"
350 "getrandom\0"
351 "getresgid\0"
352 "getresgid32\0"
353 "getresuid\0"
354 "getresuid32\0"
355 "getrlimit\0" /* make sure processes can query stack size and such */
356 "getsid\0"
357 "gettid\0"
358 "gettimeofday\0"
359 "getuid\0"
360 "getuid32\0"
361 "membarrier\0"
362 "mmap\0"
363 "mmap2\0"
364 "mprotect\0"
365 "munmap\0"
366 "nanosleep\0"
367 "pause\0"
368 "prlimit64\0"
369 "restart_syscall\0"
370 "riscv_flush_icache\0"
371 "riscv_hwprobe\0"
372 "rseq\0"
373 "rt_sigreturn\0"
374 "sched_getaffinity\0"
375 "sched_yield\0"
376 "set_robust_list\0"
377 "set_thread_area\0"
378 "set_tid_address\0"
379 "set_tls\0"
380 "sigreturn\0"
381 "time\0"
382 "ugetrlimit\0"
383 },
384 [SYSCALL_FILTER_SET_AIO] = {
385 .name = "@aio",
386 .help = "Asynchronous IO",
387 .value =
388 "io_cancel\0"
389 "io_destroy\0"
390 "io_getevents\0"
391 "io_pgetevents\0"
392 "io_pgetevents_time64\0"
393 "io_setup\0"
394 "io_submit\0"
395 "io_uring_enter\0"
396 "io_uring_register\0"
397 "io_uring_setup\0"
398 },
399 [SYSCALL_FILTER_SET_BASIC_IO] = {
400 .name = "@basic-io",
401 .help = "Basic IO",
402 .value =
403 "_llseek\0"
404 "close\0"
405 "close_range\0"
406 "dup\0"
407 "dup2\0"
408 "dup3\0"
409 "lseek\0"
410 "pread64\0"
411 "preadv\0"
412 "preadv2\0"
413 "pwrite64\0"
414 "pwritev\0"
415 "pwritev2\0"
416 "read\0"
417 "readv\0"
418 "write\0"
419 "writev\0"
420 },
421 [SYSCALL_FILTER_SET_CHOWN] = {
422 .name = "@chown",
423 .help = "Change ownership of files and directories",
424 .value =
425 "chown\0"
426 "chown32\0"
427 "fchown\0"
428 "fchown32\0"
429 "fchownat\0"
430 "lchown\0"
431 "lchown32\0"
432 },
433 [SYSCALL_FILTER_SET_CLOCK] = {
434 .name = "@clock",
435 .help = "Change the system time",
436 .value =
437 "adjtimex\0"
438 "clock_adjtime\0"
439 "clock_adjtime64\0"
440 "clock_settime\0"
441 "clock_settime64\0"
442 "settimeofday\0"
443 },
444 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
445 .name = "@cpu-emulation",
446 .help = "System calls for CPU emulation functionality",
447 .value =
448 "modify_ldt\0"
449 "subpage_prot\0"
450 "switch_endian\0"
451 "vm86\0"
452 "vm86old\0"
453 },
454 [SYSCALL_FILTER_SET_DEBUG] = {
455 .name = "@debug",
456 .help = "Debugging, performance monitoring and tracing functionality",
457 .value =
458 "lookup_dcookie\0"
459 "perf_event_open\0"
460 "pidfd_getfd\0"
461 "ptrace\0"
462 "rtas\0"
463 "s390_runtime_instr\0"
464 "sys_debug_setcontext\0"
465 },
466 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
467 .name = "@file-system",
468 .help = "File system operations",
469 .value =
470 "access\0"
471 "chdir\0"
472 "chmod\0"
473 "close\0"
474 "creat\0"
475 "faccessat\0"
476 "faccessat2\0"
477 "fallocate\0"
478 "fchdir\0"
479 "fchmod\0"
480 "fchmodat\0"
481 "fcntl\0"
482 "fcntl64\0"
483 "fgetxattr\0"
484 "flistxattr\0"
485 "fremovexattr\0"
486 "fsetxattr\0"
487 "fstat\0"
488 "fstat64\0"
489 "fstatat64\0"
490 "fstatfs\0"
491 "fstatfs64\0"
492 "ftruncate\0"
493 "ftruncate64\0"
494 "futimesat\0"
495 "getcwd\0"
496 "getdents\0"
497 "getdents64\0"
498 "getxattr\0"
499 "inotify_add_watch\0"
500 "inotify_init\0"
501 "inotify_init1\0"
502 "inotify_rm_watch\0"
503 "lgetxattr\0"
504 "link\0"
505 "linkat\0"
506 "listxattr\0"
507 "llistxattr\0"
508 "lremovexattr\0"
509 "lsetxattr\0"
510 "lstat\0"
511 "lstat64\0"
512 "mkdir\0"
513 "mkdirat\0"
514 "mknod\0"
515 "mknodat\0"
516 "newfstatat\0"
517 "oldfstat\0"
518 "oldlstat\0"
519 "oldstat\0"
520 "open\0"
521 "openat\0"
522 "openat2\0"
523 "readlink\0"
524 "readlinkat\0"
525 "removexattr\0"
526 "rename\0"
527 "renameat\0"
528 "renameat2\0"
529 "rmdir\0"
530 "setxattr\0"
531 "stat\0"
532 "stat64\0"
533 "statfs\0"
534 "statfs64\0"
535 "statx\0"
536 "symlink\0"
537 "symlinkat\0"
538 "truncate\0"
539 "truncate64\0"
540 "unlink\0"
541 "unlinkat\0"
542 "utime\0"
543 "utimensat\0"
544 "utimensat_time64\0"
545 "utimes\0"
546 },
547 [SYSCALL_FILTER_SET_IO_EVENT] = {
548 .name = "@io-event",
549 .help = "Event loop system calls",
550 .value =
551 "_newselect\0"
552 "epoll_create\0"
553 "epoll_create1\0"
554 "epoll_ctl\0"
555 "epoll_ctl_old\0"
556 "epoll_pwait\0"
557 "epoll_pwait2\0"
558 "epoll_wait\0"
559 "epoll_wait_old\0"
560 "eventfd\0"
561 "eventfd2\0"
562 "poll\0"
563 "ppoll\0"
564 "ppoll_time64\0"
565 "pselect6\0"
566 "pselect6_time64\0"
567 "select\0"
568 },
569 [SYSCALL_FILTER_SET_IPC] = {
570 .name = "@ipc",
571 .help = "SysV IPC, POSIX Message Queues or other IPC",
572 .value =
573 "ipc\0"
574 "memfd_create\0"
575 "mq_getsetattr\0"
576 "mq_notify\0"
577 "mq_open\0"
578 "mq_timedreceive\0"
579 "mq_timedreceive_time64\0"
580 "mq_timedsend\0"
581 "mq_timedsend_time64\0"
582 "mq_unlink\0"
583 "msgctl\0"
584 "msgget\0"
585 "msgrcv\0"
586 "msgsnd\0"
587 "pipe\0"
588 "pipe2\0"
589 "process_madvise\0"
590 "process_vm_readv\0"
591 "process_vm_writev\0"
592 "semctl\0"
593 "semget\0"
594 "semop\0"
595 "semtimedop\0"
596 "semtimedop_time64\0"
597 "shmat\0"
598 "shmctl\0"
599 "shmdt\0"
600 "shmget\0"
601 },
602 [SYSCALL_FILTER_SET_KEYRING] = {
603 .name = "@keyring",
604 .help = "Kernel keyring access",
605 .value =
606 "add_key\0"
607 "keyctl\0"
608 "request_key\0"
609 },
610 [SYSCALL_FILTER_SET_MEMLOCK] = {
611 .name = "@memlock",
612 .help = "Memory locking control",
613 .value =
614 "mlock\0"
615 "mlock2\0"
616 "mlockall\0"
617 "munlock\0"
618 "munlockall\0"
619 },
620 [SYSCALL_FILTER_SET_MODULE] = {
621 .name = "@module",
622 .help = "Loading and unloading of kernel modules",
623 .value =
624 "delete_module\0"
625 "finit_module\0"
626 "init_module\0"
627 },
628 [SYSCALL_FILTER_SET_MOUNT] = {
629 .name = "@mount",
630 .help = "Mounting and unmounting of file systems",
631 .value =
632 "chroot\0"
633 "fsconfig\0"
634 "fsmount\0"
635 "fsopen\0"
636 "fspick\0"
637 "mount\0"
638 "mount_setattr\0"
639 "move_mount\0"
640 "open_tree\0"
641 "pivot_root\0"
642 "umount\0"
643 "umount2\0"
644 },
645 [SYSCALL_FILTER_SET_NETWORK_IO] = {
646 .name = "@network-io",
647 .help = "Network or Unix socket IO, should not be needed if not network facing",
648 .value =
649 "accept\0"
650 "accept4\0"
651 "bind\0"
652 "connect\0"
653 "getpeername\0"
654 "getsockname\0"
655 "getsockopt\0"
656 "listen\0"
657 "recv\0"
658 "recvfrom\0"
659 "recvmmsg\0"
660 "recvmmsg_time64\0"
661 "recvmsg\0"
662 "send\0"
663 "sendmmsg\0"
664 "sendmsg\0"
665 "sendto\0"
666 "setsockopt\0"
667 "shutdown\0"
668 "socket\0"
669 "socketcall\0"
670 "socketpair\0"
671 },
672 [SYSCALL_FILTER_SET_OBSOLETE] = {
673 /* some unknown even to libseccomp */
674 .name = "@obsolete",
675 .help = "Unusual, obsolete or unimplemented system calls",
676 .value =
677 "_sysctl\0"
678 "afs_syscall\0"
679 "bdflush\0"
680 "break\0"
681 "create_module\0"
682 "ftime\0"
683 "get_kernel_syms\0"
684 "getpmsg\0"
685 "gtty\0"
686 "idle\0"
687 "lock\0"
688 "mpx\0"
689 "prof\0"
690 "profil\0"
691 "putpmsg\0"
692 "query_module\0"
693 "security\0"
694 "sgetmask\0"
695 "ssetmask\0"
696 "stime\0"
697 "stty\0"
698 "sysfs\0"
699 "tuxcall\0"
700 "ulimit\0"
701 "uselib\0"
702 "ustat\0"
703 "vserver\0"
704 },
705 [SYSCALL_FILTER_SET_PKEY] = {
706 .name = "@pkey",
707 .help = "System calls used for memory protection keys",
708 .value =
709 "pkey_alloc\0"
710 "pkey_free\0"
711 "pkey_mprotect\0"
712 },
713 [SYSCALL_FILTER_SET_PRIVILEGED] = {
714 .name = "@privileged",
715 .help = "All system calls which need super-user capabilities",
716 .value =
717 "@chown\0"
718 "@clock\0"
719 "@module\0"
720 "@raw-io\0"
721 "@reboot\0"
722 "@swap\0"
723 "_sysctl\0"
724 "acct\0"
725 "bpf\0"
726 "capset\0"
727 "chroot\0"
728 "fanotify_init\0"
729 "fanotify_mark\0"
730 "nfsservctl\0"
731 "open_by_handle_at\0"
732 "pivot_root\0"
733 "quotactl\0"
734 "quotactl_fd\0"
735 "setdomainname\0"
736 "setfsuid\0"
737 "setfsuid32\0"
738 "setgroups\0"
739 "setgroups32\0"
740 "sethostname\0"
741 "setresuid\0"
742 "setresuid32\0"
743 "setreuid\0"
744 "setreuid32\0"
745 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
746 "setuid32\0"
747 "vhangup\0"
748 },
749 [SYSCALL_FILTER_SET_PROCESS] = {
750 .name = "@process",
751 .help = "Process control, execution, namespacing operations",
752 .value =
753 "capget\0" /* Able to query arbitrary processes */
754 "clone\0"
755 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
756 * implement seccomp, so we don't need to list it at all. C.f.
757 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
758 "clone3\0"
759 "execveat\0"
760 "fork\0"
761 "getrusage\0"
762 "kill\0"
763 "pidfd_open\0"
764 "pidfd_send_signal\0"
765 "prctl\0"
766 "rt_sigqueueinfo\0"
767 "rt_tgsigqueueinfo\0"
768 "setns\0"
769 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
770 "tgkill\0"
771 "times\0"
772 "tkill\0"
773 "unshare\0"
774 "vfork\0"
775 "wait4\0"
776 "waitid\0"
777 "waitpid\0"
778 },
779 [SYSCALL_FILTER_SET_RAW_IO] = {
780 .name = "@raw-io",
781 .help = "Raw I/O port access",
782 .value =
783 "ioperm\0"
784 "iopl\0"
785 "pciconfig_iobase\0"
786 "pciconfig_read\0"
787 "pciconfig_write\0"
788 "s390_pci_mmio_read\0"
789 "s390_pci_mmio_write\0"
790 },
791 [SYSCALL_FILTER_SET_REBOOT] = {
792 .name = "@reboot",
793 .help = "Reboot and reboot preparation/kexec",
794 .value =
795 "kexec_file_load\0"
796 "kexec_load\0"
797 "reboot\0"
798 },
799 [SYSCALL_FILTER_SET_RESOURCES] = {
800 .name = "@resources",
801 .help = "Alter resource settings",
802 .value =
803 "ioprio_set\0"
804 "mbind\0"
805 "migrate_pages\0"
806 "move_pages\0"
807 "nice\0"
808 "sched_setaffinity\0"
809 "sched_setattr\0"
810 "sched_setparam\0"
811 "sched_setscheduler\0"
812 "set_mempolicy\0"
813 "set_mempolicy_home_node\0"
814 "setpriority\0"
815 "setrlimit\0"
816 },
817 [SYSCALL_FILTER_SET_SANDBOX] = {
818 .name = "@sandbox",
819 .help = "Sandbox functionality",
820 .value =
821 "landlock_add_rule\0"
822 "landlock_create_ruleset\0"
823 "landlock_restrict_self\0"
824 "seccomp\0"
825 },
826 [SYSCALL_FILTER_SET_SETUID] = {
827 .name = "@setuid",
828 .help = "Operations for changing user/group credentials",
829 .value =
830 "setgid\0"
831 "setgid32\0"
832 "setgroups\0"
833 "setgroups32\0"
834 "setregid\0"
835 "setregid32\0"
836 "setresgid\0"
837 "setresgid32\0"
838 "setresuid\0"
839 "setresuid32\0"
840 "setreuid\0"
841 "setreuid32\0"
842 "setuid\0"
843 "setuid32\0"
844 },
845 [SYSCALL_FILTER_SET_SIGNAL] = {
846 .name = "@signal",
847 .help = "Process signal handling",
848 .value =
849 "rt_sigaction\0"
850 "rt_sigpending\0"
851 "rt_sigprocmask\0"
852 "rt_sigsuspend\0"
853 "rt_sigtimedwait\0"
854 "rt_sigtimedwait_time64\0"
855 "sigaction\0"
856 "sigaltstack\0"
857 "signal\0"
858 "signalfd\0"
859 "signalfd4\0"
860 "sigpending\0"
861 "sigprocmask\0"
862 "sigsuspend\0"
863 },
864 [SYSCALL_FILTER_SET_SWAP] = {
865 .name = "@swap",
866 .help = "Enable/disable swap devices",
867 .value =
868 "swapoff\0"
869 "swapon\0"
870 },
871 [SYSCALL_FILTER_SET_SYNC] = {
872 .name = "@sync",
873 .help = "Synchronize files and memory to storage",
874 .value =
875 "fdatasync\0"
876 "fsync\0"
877 "msync\0"
878 "sync\0"
879 "sync_file_range\0"
880 "sync_file_range2\0"
881 "syncfs\0"
882 },
883 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
884 .name = "@system-service",
885 .help = "General system service operations",
886 .value =
887 "@aio\0"
888 "@basic-io\0"
889 "@chown\0"
890 "@default\0"
891 "@file-system\0"
892 "@io-event\0"
893 "@ipc\0"
894 "@keyring\0"
895 "@memlock\0"
896 "@network-io\0"
897 "@process\0"
898 "@resources\0"
899 "@setuid\0"
900 "@signal\0"
901 "@sync\0"
902 "@timer\0"
903 "arm_fadvise64_64\0"
904 "capget\0"
905 "capset\0"
906 "copy_file_range\0"
907 "fadvise64\0"
908 "fadvise64_64\0"
909 "flock\0"
910 "get_mempolicy\0"
911 "getcpu\0"
912 "getpriority\0"
913 "ioctl\0"
914 "ioprio_get\0"
915 "kcmp\0"
916 "madvise\0"
917 "mremap\0"
918 "name_to_handle_at\0"
919 "oldolduname\0"
920 "olduname\0"
921 "personality\0"
922 "readahead\0"
923 "readdir\0"
924 "remap_file_pages\0"
925 "sched_get_priority_max\0"
926 "sched_get_priority_min\0"
927 "sched_getattr\0"
928 "sched_getparam\0"
929 "sched_getscheduler\0"
930 "sched_rr_get_interval\0"
931 "sched_rr_get_interval_time64\0"
932 "sched_yield\0"
933 "sendfile\0"
934 "sendfile64\0"
935 "setfsgid\0"
936 "setfsgid32\0"
937 "setfsuid\0"
938 "setfsuid32\0"
939 "setpgid\0"
940 "setsid\0"
941 "splice\0"
942 "sysinfo\0"
943 "tee\0"
944 "umask\0"
945 "uname\0"
946 "userfaultfd\0"
947 "vmsplice\0"
948 },
949 [SYSCALL_FILTER_SET_TIMER] = {
950 .name = "@timer",
951 .help = "Schedule operations by time",
952 .value =
953 "alarm\0"
954 "getitimer\0"
955 "setitimer\0"
956 "timer_create\0"
957 "timer_delete\0"
958 "timer_getoverrun\0"
959 "timer_gettime\0"
960 "timer_gettime64\0"
961 "timer_settime\0"
962 "timer_settime64\0"
963 "timerfd_create\0"
964 "timerfd_gettime\0"
965 "timerfd_gettime64\0"
966 "timerfd_settime\0"
967 "timerfd_settime64\0"
968 "times\0"
969 },
970 [SYSCALL_FILTER_SET_KNOWN] = {
971 .name = "@known",
972 .help = "All known syscalls declared in the kernel",
973 .value =
974 "@obsolete\0" /* nested group; the remaining members are supplied by the header included on the next line */
975 #include "syscall-list.h"
976 },
977 };
978
979 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
980 if (isempty(name) || name[0] != '@')
981 return NULL;
982
983 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
984 if (streq(syscall_filter_sets[i].name, name))
985 return syscall_filter_sets + i;
986
987 return NULL;
988 }
989
990 static int add_syscall_filter_set(
991 scmp_filter_ctx seccomp,
992 const SyscallFilterSet *set,
993 uint32_t action,
994 char **exclude,
995 bool log_missing,
996 char ***added);
997
998 int seccomp_add_syscall_filter_item(
999 scmp_filter_ctx *seccomp,
1000 const char *name,
1001 uint32_t action,
1002 char **exclude,
1003 bool log_missing,
1004 char ***added) {
1005
1006 assert(seccomp);
1007 assert(name);
1008
1009 if (strv_contains(exclude, name))
1010 return 0;
1011
1012 /* Any syscalls that are handled are added to the *added strv. The pointer
1013 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1014
1015 if (name[0] == '@') {
1016 const SyscallFilterSet *other;
1017
1018 other = syscall_filter_set_find(name);
1019 if (!other)
1020 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
1021 "Filter set %s is not known!",
1022 name);
1023
1024 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
1025
1026 } else {
1027 int id, r;
1028
1029 id = seccomp_syscall_resolve_name(name);
1030 if (id == __NR_SCMP_ERROR) {
1031 if (log_missing)
1032 log_debug("System call %s is not known, ignoring.", name);
1033 return 0;
1034 }
1035
1036 r = seccomp_rule_add_exact(seccomp, action, id, 0);
1037 if (r < 0) {
1038 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1039 bool ignore = r == -EDOM;
1040
1041 if (!ignore || log_missing)
1042 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1043 name, id, ignore ? ", ignoring" : "");
1044 if (!ignore)
1045 return r;
1046 }
1047
1048 if (added) {
1049 r = strv_extend(added, name);
1050 if (r < 0)
1051 return r;
1052 }
1053
1054 return 0;
1055 }
1056 }
1057
1058 static int add_syscall_filter_set(
1059 scmp_filter_ctx seccomp,
1060 const SyscallFilterSet *set,
1061 uint32_t action,
1062 char **exclude,
1063 bool log_missing,
1064 char ***added) {
1065
1066 int r;
1067
1068 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1069
1070 assert(seccomp);
1071 assert(set);
1072
1073 NULSTR_FOREACH(sys, set->value) {
1074 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1075 if (r < 0)
1076 return r;
1077 }
1078
1079 return 0;
1080 }
1081
1082 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1083 uint32_t arch;
1084 int r;
1085
1086 assert(set);
1087
1088 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1089 * each local arch. */
1090
1091 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1092 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1093
1094 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1095
1096 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1097 if (r < 0)
1098 return r;
1099
1100 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1101 if (r < 0)
1102 return log_debug_errno(r, "Failed to add filter set: %m");
1103
1104 r = seccomp_load(seccomp);
1105 if (r < 0) {
1106 if (ERRNO_IS_SECCOMP_FATAL(r))
1107 return r;
1108 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1109 }
1110 }
1111
1112 return 0;
1113 }
1114
1115 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1116 uint32_t arch;
1117 int r;
1118
1119 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1120 * of a SyscallFilterSet* table. */
1121
1122 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1123 return 0;
1124
1125 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1126 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1127 void *syscall_id, *val;
1128
1129 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1130
1131 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1132 if (r < 0)
1133 return r;
1134
1135 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1136 uint32_t a = action;
1137 int id = PTR_TO_INT(syscall_id) - 1;
1138 int error = PTR_TO_INT(val);
1139
1140 if (error == SECCOMP_ERROR_NUMBER_KILL)
1141 a = scmp_act_kill_process();
1142 #ifdef SCMP_ACT_LOG
1143 else if (action == SCMP_ACT_LOG)
1144 a = SCMP_ACT_LOG;
1145 #endif
1146 else if (error >= 0)
1147 a = SCMP_ACT_ERRNO(error);
1148
1149 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1150 if (r < 0) {
1151 /* If the system call is not known on this architecture, then that's
1152 * fine, let's ignore it */
1153 _cleanup_free_ char *n = NULL;
1154 bool ignore;
1155
1156 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1157 ignore = r == -EDOM;
1158 if (!ignore || log_missing)
1159 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1160 strna(n), id, ignore ? ", ignoring" : "");
1161 if (!ignore)
1162 return r;
1163 }
1164 }
1165
1166 r = seccomp_load(seccomp);
1167 if (r < 0) {
1168 if (ERRNO_IS_SECCOMP_FATAL(r))
1169 return r;
1170 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1171 seccomp_arch_to_string(arch));
1172 }
1173 }
1174
1175 return 0;
1176 }
1177
1178 int seccomp_parse_syscall_filter(
1179 const char *name,
1180 int errno_num,
1181 Hashmap *filter,
1182 SeccompParseFlags flags,
1183 const char *unit,
1184 const char *filename,
1185 unsigned line) {
1186
1187 int r;
1188
1189 assert(name);
1190 assert(filter);
1191
1192 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1193 return -EINVAL;
1194
1195 if (name[0] == '@') {
1196 const SyscallFilterSet *set;
1197
1198 set = syscall_filter_set_find(name);
1199 if (!set) {
1200 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1201 return -EINVAL;
1202
1203 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1204 "Unknown system call group, ignoring: %s", name);
1205 return 0;
1206 }
1207
1208 NULSTR_FOREACH(i, set->value) {
1209 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1210 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1211 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1212 * about them. */
1213 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1214 if (r < 0)
1215 return r;
1216 }
1217 } else {
1218 int id;
1219
1220 id = seccomp_syscall_resolve_name(name);
1221 if (id == __NR_SCMP_ERROR) {
1222 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1223 return -EINVAL;
1224
1225 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1226 "Failed to parse system call, ignoring: %s", name);
1227 return 0;
1228 }
1229
1230 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1231 * it from the list. The entries in allow-list with non-negative error value will be
1232 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1233 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1234 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1235 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1236 if (r < 0)
1237 switch (r) {
1238 case -ENOMEM:
1239 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1240 case -EEXIST:
1241 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1242 break;
1243 default:
1244 return r;
1245 }
1246 } else
1247 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1248 }
1249
1250 return 0;
1251 }
1252
1253 int seccomp_restrict_namespaces(unsigned long retain) {
1254 uint32_t arch;
1255 int r;
1256
1257 if (DEBUG_LOGGING) {
1258 _cleanup_free_ char *s = NULL;
1259
1260 (void) namespace_flags_to_string(retain, &s);
1261 log_debug("Restricting namespace to: %s.", strna(s));
1262 }
1263
1264 /* NOOP? */
1265 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1266 return 0;
1267
1268 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1269 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1270
1271 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1272
1273 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1274 if (r < 0)
1275 return r;
1276
1277 /* We cannot filter on individual flags to clone3(), and we need to disable the
1278 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1279 * users shall fall back to clone(), as if on an older kernel.
1280 *
1281 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1282 * https://github.com/moby/moby/issues/42680. */
1283
1284 r = seccomp_rule_add_exact(
1285 seccomp,
1286 SCMP_ACT_ERRNO(ENOSYS),
1287 SCMP_SYS(clone3),
1288 0);
1289 if (r < 0)
1290 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1291
1292 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1293 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1294 * altogether. */
1295 r = seccomp_rule_add_exact(
1296 seccomp,
1297 SCMP_ACT_ERRNO(EPERM),
1298 SCMP_SYS(setns),
1299 0);
1300 else
1301 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1302 * special invocation with a zero flags argument, right here. */
1303 r = seccomp_rule_add_exact(
1304 seccomp,
1305 SCMP_ACT_ERRNO(EPERM),
1306 SCMP_SYS(setns),
1307 1,
1308 SCMP_A1(SCMP_CMP_EQ, 0));
1309 if (r < 0) {
1310 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1311 continue;
1312 }
1313
1314 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
1315 unsigned long f;
1316
1317 f = namespace_info[i].clone_flag;
1318 if (FLAGS_SET(retain, f)) {
1319 log_debug("Permitting %s.", namespace_info[i].proc_name);
1320 continue;
1321 }
1322
1323 log_trace("Blocking %s.", namespace_info[i].proc_name);
1324
1325 r = seccomp_rule_add_exact(
1326 seccomp,
1327 SCMP_ACT_ERRNO(EPERM),
1328 SCMP_SYS(unshare),
1329 1,
1330 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1331 if (r < 0) {
1332 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333 break;
1334 }
1335
1336 /* On s390/s390x the first two parameters to clone are switched */
1337 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1338 r = seccomp_rule_add_exact(
1339 seccomp,
1340 SCMP_ACT_ERRNO(EPERM),
1341 SCMP_SYS(clone),
1342 1,
1343 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1344 else
1345 r = seccomp_rule_add_exact(
1346 seccomp,
1347 SCMP_ACT_ERRNO(EPERM),
1348 SCMP_SYS(clone),
1349 1,
1350 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1351 if (r < 0) {
1352 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1353 break;
1354 }
1355
1356 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1357 r = seccomp_rule_add_exact(
1358 seccomp,
1359 SCMP_ACT_ERRNO(EPERM),
1360 SCMP_SYS(setns),
1361 1,
1362 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1363 if (r < 0) {
1364 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1365 break;
1366 }
1367 }
1368 }
1369 if (r < 0)
1370 continue;
1371
1372 r = seccomp_load(seccomp);
1373 if (r < 0) {
1374 if (ERRNO_IS_SECCOMP_FATAL(r))
1375 return r;
1376 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1377 }
1378 }
1379
1380 return 0;
1381 }
1382
1383 int seccomp_protect_sysctl(void) {
1384 uint32_t arch;
1385 int r;
1386
1387 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1388 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1389
1390 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1391
1392 if (IN_SET(arch,
1393 SCMP_ARCH_AARCH64,
1394 #ifdef SCMP_ARCH_LOONGARCH64
1395 SCMP_ARCH_LOONGARCH64,
1396 #endif
1397 #ifdef SCMP_ARCH_RISCV64
1398 SCMP_ARCH_RISCV64,
1399 #endif
1400 SCMP_ARCH_X32
1401 ))
1402 /* No _sysctl syscall */
1403 continue;
1404
1405 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1406 if (r < 0)
1407 return r;
1408
1409 r = seccomp_rule_add_exact(
1410 seccomp,
1411 SCMP_ACT_ERRNO(EPERM),
1412 SCMP_SYS(_sysctl),
1413 0);
1414 if (r < 0) {
1415 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1416 continue;
1417 }
1418
1419 r = seccomp_load(seccomp);
1420 if (r < 0) {
1421 if (ERRNO_IS_SECCOMP_FATAL(r))
1422 return r;
1423 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 }
1425 }
1426
1427 return 0;
1428 }
1429
1430 int seccomp_protect_syslog(void) {
1431 uint32_t arch;
1432 int r;
1433
1434 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1435 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1436
1437 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1438 if (r < 0)
1439 return r;
1440
1441 r = seccomp_rule_add_exact(
1442 seccomp,
1443 SCMP_ACT_ERRNO(EPERM),
1444 SCMP_SYS(syslog),
1445 0);
1446
1447 if (r < 0) {
1448 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1449 continue;
1450 }
1451
1452 r = seccomp_load(seccomp);
1453 if (r < 0) {
1454 if (ERRNO_IS_SECCOMP_FATAL(r))
1455 return r;
1456 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1457 }
1458 }
1459
1460 return 0;
1461 }
1462
1463 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1464 uint32_t arch;
1465 int r;
1466
1467 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1468 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1469 bool supported;
1470
1471 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1472
1473 switch (arch) {
1474
1475 case SCMP_ARCH_X86_64:
1476 case SCMP_ARCH_X32:
1477 case SCMP_ARCH_ARM:
1478 case SCMP_ARCH_AARCH64:
1479 #ifdef SCMP_ARCH_LOONGARCH64
1480 case SCMP_ARCH_LOONGARCH64:
1481 #endif
1482 case SCMP_ARCH_MIPSEL64N32:
1483 case SCMP_ARCH_MIPS64N32:
1484 case SCMP_ARCH_MIPSEL64:
1485 case SCMP_ARCH_MIPS64:
1486 #ifdef SCMP_ARCH_RISCV64
1487 case SCMP_ARCH_RISCV64:
1488 #endif
1489 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1490 supported = true;
1491 break;
1492
1493 case SCMP_ARCH_S390:
1494 case SCMP_ARCH_S390X:
1495 case SCMP_ARCH_X86:
1496 case SCMP_ARCH_MIPSEL:
1497 case SCMP_ARCH_MIPS:
1498 #ifdef SCMP_ARCH_PARISC
1499 case SCMP_ARCH_PARISC:
1500 #endif
1501 #ifdef SCMP_ARCH_PARISC64
1502 case SCMP_ARCH_PARISC64:
1503 #endif
1504 case SCMP_ARCH_PPC:
1505 case SCMP_ARCH_PPC64:
1506 case SCMP_ARCH_PPC64LE:
1507 default:
1508 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1509 * don't know */
1510 supported = false;
1511 break;
1512 }
1513
1514 if (!supported)
1515 continue;
1516
1517 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1518 if (r < 0)
1519 return r;
1520
1521 if (allow_list) {
1522 int first = 0, last = 0;
1523 void *afp;
1524
1525 /* If this is an allow list, we first block the address families that are out of
1526 * range and then everything that is not in the set. First, we find the lowest and
1527 * highest address family in the set. */
1528
1529 SET_FOREACH(afp, address_families) {
1530 int af = PTR_TO_INT(afp);
1531
1532 if (af <= 0 || af >= af_max())
1533 continue;
1534
1535 if (first == 0 || af < first)
1536 first = af;
1537
1538 if (last == 0 || af > last)
1539 last = af;
1540 }
1541
1542 assert((first == 0) == (last == 0));
1543
1544 if (first == 0) {
1545
1546 /* No entries in the valid range, block everything */
1547 r = seccomp_rule_add_exact(
1548 seccomp,
1549 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1550 SCMP_SYS(socket),
1551 0);
1552 if (r < 0) {
1553 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1554 continue;
1555 }
1556
1557 } else {
1558
1559 /* Block everything below the first entry */
1560 r = seccomp_rule_add_exact(
1561 seccomp,
1562 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1563 SCMP_SYS(socket),
1564 1,
1565 SCMP_A0(SCMP_CMP_LT, first));
1566 if (r < 0) {
1567 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1568 continue;
1569 }
1570
1571 /* Block everything above the last entry */
1572 r = seccomp_rule_add_exact(
1573 seccomp,
1574 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1575 SCMP_SYS(socket),
1576 1,
1577 SCMP_A0(SCMP_CMP_GT, last));
1578 if (r < 0) {
1579 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1580 continue;
1581 }
1582
1583 /* Block everything between the first and last entry */
1584 for (int af = 1; af < af_max(); af++) {
1585
1586 if (set_contains(address_families, INT_TO_PTR(af)))
1587 continue;
1588
1589 r = seccomp_rule_add_exact(
1590 seccomp,
1591 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1592 SCMP_SYS(socket),
1593 1,
1594 SCMP_A0(SCMP_CMP_EQ, af));
1595 if (r < 0)
1596 break;
1597 }
1598 if (r < 0) {
1599 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1600 continue;
1601 }
1602 }
1603
1604 } else {
1605 void *af;
1606
1607 /* If this is a deny list, then generate one rule for each address family that are
1608 * then combined in OR checks. */
1609
1610 SET_FOREACH(af, address_families) {
1611 r = seccomp_rule_add_exact(
1612 seccomp,
1613 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1614 SCMP_SYS(socket),
1615 1,
1616 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1617 if (r < 0)
1618 break;
1619 }
1620 if (r < 0) {
1621 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1622 continue;
1623 }
1624 }
1625
1626 r = seccomp_load(seccomp);
1627 if (r < 0) {
1628 if (ERRNO_IS_SECCOMP_FATAL(r))
1629 return r;
1630 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1631 }
1632 }
1633
1634 return 0;
1635 }
1636
1637 int seccomp_restrict_realtime_full(int error_code) {
1638 static const int permitted_policies[] = {
1639 SCHED_OTHER,
1640 SCHED_BATCH,
1641 SCHED_IDLE,
1642 };
1643
1644 int r, max_policy = 0;
1645 uint32_t arch;
1646 unsigned i;
1647
1648 assert(error_code > 0);
1649
1650 /* Determine the highest policy constant we want to allow */
1651 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1652 if (permitted_policies[i] > max_policy)
1653 max_policy = permitted_policies[i];
1654
1655 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1656 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1657 int p;
1658
1659 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1660
1661 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1662 if (r < 0)
1663 return r;
1664
1665 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1666 * allow list. */
1667 for (p = 0; p < max_policy; p++) {
1668 bool good = false;
1669
1670 /* Check if this is in the allow list. */
1671 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1672 if (permitted_policies[i] == p) {
1673 good = true;
1674 break;
1675 }
1676
1677 if (good)
1678 continue;
1679
1680 /* Deny this policy */
1681 r = seccomp_rule_add_exact(
1682 seccomp,
1683 SCMP_ACT_ERRNO(error_code),
1684 SCMP_SYS(sched_setscheduler),
1685 1,
1686 SCMP_A1(SCMP_CMP_EQ, p));
1687 if (r < 0) {
1688 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1689 continue;
1690 }
1691 }
1692
1693 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1694 * are unsigned here, hence no need no check for < 0 values. */
1695 r = seccomp_rule_add_exact(
1696 seccomp,
1697 SCMP_ACT_ERRNO(error_code),
1698 SCMP_SYS(sched_setscheduler),
1699 1,
1700 SCMP_A1(SCMP_CMP_GT, max_policy));
1701 if (r < 0) {
1702 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1703 continue;
1704 }
1705
1706 r = seccomp_load(seccomp);
1707 if (r < 0) {
1708 if (ERRNO_IS_SECCOMP_FATAL(r))
1709 return r;
1710 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1711 }
1712 }
1713
1714 return 0;
1715 }
1716
1717 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1718 uint32_t arch,
1719 int nr,
1720 unsigned arg_cnt,
1721 const struct scmp_arg_cmp arg) {
1722 int r;
1723
1724 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1725 if (r < 0) {
1726 _cleanup_free_ char *n = NULL;
1727
1728 n = seccomp_syscall_resolve_num_arch(arch, nr);
1729 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1730 strna(n),
1731 seccomp_arch_to_string(arch));
1732 }
1733
1734 return r;
1735 }
1736
1737 /* For known architectures, check that syscalls are indeed defined or not. */
1738 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
1739 assert_cc(SCMP_SYS(shmget) > 0);
1740 assert_cc(SCMP_SYS(shmat) > 0);
1741 assert_cc(SCMP_SYS(shmdt) > 0);
1742 #endif
1743
1744 int seccomp_memory_deny_write_execute(void) {
1745 uint32_t arch;
1746 unsigned loaded = 0;
1747
1748 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1749 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1750 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1751
1752 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1753
1754 switch (arch) {
1755
1756 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1757 * We ignore that here, which means there's still a way to get writable/executable
1758 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1759 *
1760 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1761 * on that front (kernel work done in 5.18).
1762 */
1763
1764 case SCMP_ARCH_X86:
1765 case SCMP_ARCH_S390:
1766 filter_syscall = SCMP_SYS(mmap2);
1767 block_syscall = SCMP_SYS(mmap);
1768 /* shmat multiplexed, see above */
1769 break;
1770
1771 case SCMP_ARCH_PPC:
1772 case SCMP_ARCH_PPC64:
1773 case SCMP_ARCH_PPC64LE:
1774 case SCMP_ARCH_S390X:
1775 filter_syscall = SCMP_SYS(mmap);
1776 /* shmat multiplexed, see above */
1777 break;
1778
1779 case SCMP_ARCH_ARM:
1780 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1781 shmat_syscall = SCMP_SYS(shmat);
1782 break;
1783
1784 case SCMP_ARCH_X86_64:
1785 case SCMP_ARCH_X32:
1786 case SCMP_ARCH_AARCH64:
1787 #ifdef SCMP_ARCH_LOONGARCH64
1788 case SCMP_ARCH_LOONGARCH64:
1789 #endif
1790 #ifdef SCMP_ARCH_RISCV64
1791 case SCMP_ARCH_RISCV64:
1792 #endif
1793 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
1794 shmat_syscall = SCMP_SYS(shmat);
1795 break;
1796
1797 /* Please add more definitions here, if you port systemd to other architectures! */
1798
1799 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
1800 #warning "Consider adding the right mmap() syscall definitions here!"
1801 #endif
1802 }
1803
1804 /* Can't filter mmap() on this arch, then skip it */
1805 if (filter_syscall == 0)
1806 continue;
1807
1808 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1809 if (r < 0)
1810 return r;
1811
1812 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1813 1,
1814 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1815 if (r < 0)
1816 continue;
1817
1818 if (block_syscall != 0) {
1819 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1820 if (r < 0)
1821 continue;
1822 }
1823
1824 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1825 1,
1826 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1827 if (r < 0)
1828 continue;
1829
1830 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1831 1,
1832 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1833 if (r < 0)
1834 continue;
1835
1836 if (shmat_syscall > 0) {
1837 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1838 1,
1839 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1840 if (r < 0)
1841 continue;
1842 }
1843
1844 r = seccomp_load(seccomp);
1845 if (r < 0) {
1846 if (ERRNO_IS_SECCOMP_FATAL(r))
1847 return r;
1848 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1849 seccomp_arch_to_string(arch));
1850 }
1851 loaded++;
1852 }
1853
1854 if (loaded == 0)
1855 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1856
1857 return loaded;
1858 }
1859
1860 int seccomp_restrict_archs(Set *archs) {
1861 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1862 int r;
1863 bool blocked_new = false;
1864
1865 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1866 * list.
1867 *
1868 * There are some qualifications. However the most important use is to stop processes from bypassing
1869 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1870 * in a non-native architecture. There are no holes in this use case, at least so far. */
1871
1872 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1873 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1874 * to run a program with the restrictions applied. */
1875 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1876 if (!seccomp)
1877 return -ENOMEM;
1878
1879 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1880 uint32_t arch = seccomp_local_archs[i];
1881
1882 /* See above comment, our "native" architecture is never blocked. */
1883 if (arch == seccomp_arch_native())
1884 continue;
1885
1886 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1887 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1888 continue;
1889
1890 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1891
1892 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1893 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1894 * The important thing is that you can block the old 32-bit x86 syscalls.
1895 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1896 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1897 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1898
1899 if (block) {
1900 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1901 blocked_new = true;
1902 } else {
1903 r = seccomp_arch_add(seccomp, arch);
1904 if (r < 0 && r != -EEXIST)
1905 return r;
1906 }
1907 }
1908
1909 /* All architectures that will be blocked by the seccomp program were
1910 * already blocked. */
1911 if (!blocked_new)
1912 return 0;
1913
1914 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1915 if (r < 0)
1916 return r;
1917
1918 r = seccomp_load(seccomp);
1919 if (r < 0) {
1920 if (ERRNO_IS_SECCOMP_FATAL(r))
1921 return r;
1922 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1923 }
1924
1925 return 0;
1926 }
1927
1928 int parse_syscall_archs(char **l, Set **ret_archs) {
1929 _cleanup_set_free_ Set *archs = NULL;
1930 int r;
1931
1932 assert(l);
1933 assert(ret_archs);
1934
1935 STRV_FOREACH(s, l) {
1936 uint32_t a;
1937
1938 r = seccomp_arch_from_string(*s, &a);
1939 if (r < 0)
1940 return -EINVAL;
1941
1942 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1943 if (r < 0)
1944 return -ENOMEM;
1945 }
1946
1947 *ret_archs = TAKE_PTR(archs);
1948 return 0;
1949 }
1950
1951 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1952 int r;
1953
1954 assert(set);
1955
1956 NULSTR_FOREACH(i, set->value) {
1957
1958 if (i[0] == '@') {
1959 const SyscallFilterSet *more;
1960
1961 more = syscall_filter_set_find(i);
1962 if (!more)
1963 return -ENXIO;
1964
1965 r = seccomp_filter_set_add(filter, add, more);
1966 if (r < 0)
1967 return r;
1968 } else {
1969 int id;
1970
1971 id = seccomp_syscall_resolve_name(i);
1972 if (id == __NR_SCMP_ERROR) {
1973 log_debug("Couldn't resolve system call, ignoring: %s", i);
1974 continue;
1975 }
1976
1977 if (add) {
1978 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1979 if (r < 0)
1980 return r;
1981 } else
1982 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1983 }
1984 }
1985
1986 return 0;
1987 }
1988
1989 int seccomp_lock_personality(unsigned long personality) {
1990 uint32_t arch;
1991 int r;
1992
1993 if (personality >= PERSONALITY_INVALID)
1994 return -EINVAL;
1995
1996 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1997 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1998
1999 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2000 if (r < 0)
2001 return r;
2002
2003 r = seccomp_rule_add_exact(
2004 seccomp,
2005 SCMP_ACT_ERRNO(EPERM),
2006 SCMP_SYS(personality),
2007 1,
2008 SCMP_A0(SCMP_CMP_NE, personality));
2009 if (r < 0) {
2010 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2011 continue;
2012 }
2013
2014 r = seccomp_load(seccomp);
2015 if (r < 0) {
2016 if (ERRNO_IS_SECCOMP_FATAL(r))
2017 return r;
2018 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2019 }
2020 }
2021
2022 return 0;
2023 }
2024
2025 int seccomp_protect_hostname(void) {
2026 uint32_t arch;
2027 int r;
2028
2029 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2030 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2031
2032 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2033 if (r < 0)
2034 return r;
2035
2036 r = seccomp_rule_add_exact(
2037 seccomp,
2038 SCMP_ACT_ERRNO(EPERM),
2039 SCMP_SYS(sethostname),
2040 0);
2041 if (r < 0) {
2042 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2043 continue;
2044 }
2045
2046 r = seccomp_rule_add_exact(
2047 seccomp,
2048 SCMP_ACT_ERRNO(EPERM),
2049 SCMP_SYS(setdomainname),
2050 0);
2051 if (r < 0) {
2052 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2053 continue;
2054 }
2055
2056 r = seccomp_load(seccomp);
2057 if (r < 0) {
2058 if (ERRNO_IS_SECCOMP_FATAL(r))
2059 return r;
2060 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2061 }
2062 }
2063
2064 return 0;
2065 }
2066
2067 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2068 /* Checks the mode_t parameter of the following system calls:
2069 *
2070 * → chmod() + fchmod() + fchmodat()
2071 * → open() + creat() + openat()
2072 * → mkdir() + mkdirat()
2073 * → mknod() + mknodat()
2074 *
2075 * Returns error if *everything* failed, and 0 otherwise.
2076 */
2077 int r;
2078 bool any = false;
2079
2080 r = seccomp_rule_add_exact(
2081 seccomp,
2082 SCMP_ACT_ERRNO(EPERM),
2083 SCMP_SYS(chmod),
2084 1,
2085 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2086 if (r < 0)
2087 log_debug_errno(r, "Failed to add filter for chmod: %m");
2088 else
2089 any = true;
2090
2091 r = seccomp_rule_add_exact(
2092 seccomp,
2093 SCMP_ACT_ERRNO(EPERM),
2094 SCMP_SYS(fchmod),
2095 1,
2096 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2097 if (r < 0)
2098 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2099 else
2100 any = true;
2101
2102 r = seccomp_rule_add_exact(
2103 seccomp,
2104 SCMP_ACT_ERRNO(EPERM),
2105 SCMP_SYS(fchmodat),
2106 1,
2107 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2108 if (r < 0)
2109 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2110 else
2111 any = true;
2112
2113 r = seccomp_rule_add_exact(
2114 seccomp,
2115 SCMP_ACT_ERRNO(EPERM),
2116 SCMP_SYS(mkdir),
2117 1,
2118 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2119 if (r < 0)
2120 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2121 else
2122 any = true;
2123
2124 r = seccomp_rule_add_exact(
2125 seccomp,
2126 SCMP_ACT_ERRNO(EPERM),
2127 SCMP_SYS(mkdirat),
2128 1,
2129 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2130 if (r < 0)
2131 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2132 else
2133 any = true;
2134
2135 r = seccomp_rule_add_exact(
2136 seccomp,
2137 SCMP_ACT_ERRNO(EPERM),
2138 SCMP_SYS(mknod),
2139 1,
2140 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2141 if (r < 0)
2142 log_debug_errno(r, "Failed to add filter for mknod: %m");
2143 else
2144 any = true;
2145
2146 r = seccomp_rule_add_exact(
2147 seccomp,
2148 SCMP_ACT_ERRNO(EPERM),
2149 SCMP_SYS(mknodat),
2150 1,
2151 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2152 if (r < 0)
2153 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2154 else
2155 any = true;
2156
2157 r = seccomp_rule_add_exact(
2158 seccomp,
2159 SCMP_ACT_ERRNO(EPERM),
2160 SCMP_SYS(open),
2161 2,
2162 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2163 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2164 if (r < 0)
2165 log_debug_errno(r, "Failed to add filter for open: %m");
2166 else
2167 any = true;
2168
2169 r = seccomp_rule_add_exact(
2170 seccomp,
2171 SCMP_ACT_ERRNO(EPERM),
2172 SCMP_SYS(openat),
2173 2,
2174 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2175 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2176 if (r < 0)
2177 log_debug_errno(r, "Failed to add filter for openat: %m");
2178 else
2179 any = true;
2180
2181 #if defined(__SNR_openat2)
2182 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2183 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2184 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2185 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2186 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2187 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2188 r = seccomp_rule_add_exact(
2189 seccomp,
2190 SCMP_ACT_ERRNO(ENOSYS),
2191 SCMP_SYS(openat2),
2192 0);
2193 if (r < 0)
2194 log_debug_errno(r, "Failed to add filter for openat2: %m");
2195 else
2196 any = true;
2197 #endif
2198
2199 r = seccomp_rule_add_exact(
2200 seccomp,
2201 SCMP_ACT_ERRNO(EPERM),
2202 SCMP_SYS(creat),
2203 1,
2204 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2205 if (r < 0)
2206 log_debug_errno(r, "Failed to add filter for creat: %m");
2207 else
2208 any = true;
2209
2210 return any ? 0 : r;
2211 }
2212
2213 int seccomp_restrict_suid_sgid(void) {
2214 uint32_t arch;
2215 int r, k;
2216
2217 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2218 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2219
2220 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2221 if (r < 0)
2222 return r;
2223
2224 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2225 if (r < 0)
2226 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2227
2228 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2229 if (k < 0)
2230 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2231
2232 if (r < 0 && k < 0)
2233 continue;
2234
2235 r = seccomp_load(seccomp);
2236 if (r < 0) {
2237 if (ERRNO_IS_SECCOMP_FATAL(r))
2238 return r;
2239 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2240 }
2241 }
2242
2243 return 0;
2244 }
2245
2246 uint32_t scmp_act_kill_process(void) {
2247
2248 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2249 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2250 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2251 * for single-threaded apps does the right thing. */
2252
2253 #ifdef SCMP_ACT_KILL_PROCESS
2254 if (seccomp_api_get() >= 3)
2255 return SCMP_ACT_KILL_PROCESS;
2256 #endif
2257
2258 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2259 }
2260
2261 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2262 _cleanup_free_ char *n = NULL;
2263 char *p;
2264 int e = -1;
2265
2266 assert(in);
2267 assert(name);
2268 assert(error);
2269
2270 /*
2271 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2272 * If errno is omitted, then error is set to -1.
2273 * Empty syscall name is not allowed.
2274 * Here, we do not check that the syscall name is valid or not.
2275 */
2276
2277 p = strchr(in, ':');
2278 if (p) {
2279 e = seccomp_parse_errno_or_action(p + 1);
2280 if (e < 0)
2281 return e;
2282
2283 n = strndup(in, p - in);
2284 } else
2285 n = strdup(in);
2286
2287 if (!n)
2288 return -ENOMEM;
2289
2290 if (isempty(n))
2291 return -EINVAL;
2292
2293 *error = e;
2294 *name = TAKE_PTR(n);
2295
2296 return 0;
2297 }
2298
2299 static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2300 bool any = false;
2301 int r;
2302
2303 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2304 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2305
2306 r = seccomp_rule_add_exact(
2307 seccomp,
2308 SCMP_ACT_ERRNO(EINVAL),
2309 SCMP_SYS(open),
2310 1,
2311 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2312 if (r < 0)
2313 log_debug_errno(r, "Failed to add filter for open: %m");
2314 else
2315 any = true;
2316
2317 r = seccomp_rule_add_exact(
2318 seccomp,
2319 SCMP_ACT_ERRNO(EINVAL),
2320 SCMP_SYS(openat),
2321 1,
2322 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2323 if (r < 0)
2324 log_debug_errno(r, "Failed to add filter for openat: %m");
2325 else
2326 any = true;
2327
2328 #if defined(__SNR_openat2)
2329 /* The new openat2() system call can't be filtered sensibly, see above. */
2330 r = seccomp_rule_add_exact(
2331 seccomp,
2332 SCMP_ACT_ERRNO(ENOSYS),
2333 SCMP_SYS(openat2),
2334 0);
2335 if (r < 0)
2336 log_debug_errno(r, "Failed to add filter for openat2: %m");
2337 else
2338 any = true;
2339 #endif
2340
2341 return any ? 0 : r;
2342 }
2343
2344 int seccomp_suppress_sync(void) {
2345 uint32_t arch;
2346 int r;
2347
2348 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2349 * manageable, and also masks O_SYNC/O_DSYNC */
2350
2351 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2352 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2353
2354 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2355 if (r < 0)
2356 return r;
2357
2358 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2359 int id;
2360
2361 id = seccomp_syscall_resolve_name(c);
2362 if (id == __NR_SCMP_ERROR) {
2363 log_debug("System call %s is not known, ignoring.", c);
2364 continue;
2365 }
2366
2367 r = seccomp_rule_add_exact(
2368 seccomp,
2369 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2370 id,
2371 0);
2372 if (r < 0)
2373 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2374 }
2375
2376 (void) block_open_flag(seccomp, O_SYNC);
2377 #if O_DSYNC != O_SYNC
2378 (void) block_open_flag(seccomp, O_DSYNC);
2379 #endif
2380
2381 r = seccomp_load(seccomp);
2382 if (r < 0) {
2383 if (ERRNO_IS_SECCOMP_FATAL(r))
2384 return r;
2385 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2386 }
2387 }
2388
2389 return 0;
2390 }