]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
fuzz: unify logging setup
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
a8fbdf54 6#include <stddef.h>
469830d1 7#include <sys/mman.h>
d347d902 8#include <sys/prctl.h>
469830d1 9#include <sys/shm.h>
3c27973b 10#include <sys/stat.h>
57183d11 11
e83156c2
YW
12/* Include missing_syscall_def.h before <seccomp.h>, so that __SNR_foo gets mapped to __NR_foo. */
13#include "missing_syscall_def.h"
14#include <seccomp.h>
15
469830d1 16#include "af-list.h"
add00535 17#include "alloc-util.h"
44aaddad 18#include "env-util.h"
d8b4d14d 19#include "errno-list.h"
a8fbdf54 20#include "macro.h"
241b1577 21#include "namespace-util.h"
add00535 22#include "nsflags.h"
d8b4d14d 23#include "nulstr-util.h"
78e864e5 24#include "process-util.h"
cf0fbc49 25#include "seccomp-util.h"
b16bd535 26#include "set.h"
07630cea 27#include "string-util.h"
b16bd535 28#include "strv.h"
469830d1 29
65976868
GDF
30/* This array will be modified at runtime as seccomp_restrict_archs is called. */
31uint32_t seccomp_local_archs[] = {
469830d1 32
6b000af4 33 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
34
35#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
36 SCMP_ARCH_X86,
37 SCMP_ARCH_X86_64,
f2d9751c
LP
38 SCMP_ARCH_X32, /* native */
39#elif defined(__x86_64__) && !defined(__ILP32__)
40 SCMP_ARCH_X86,
469830d1 41 SCMP_ARCH_X32,
f2d9751c
LP
42 SCMP_ARCH_X86_64, /* native */
43#elif defined(__i386__)
44 SCMP_ARCH_X86,
45#elif defined(__aarch64__)
469830d1 46 SCMP_ARCH_ARM,
f2d9751c
LP
47 SCMP_ARCH_AARCH64, /* native */
48#elif defined(__arm__)
49 SCMP_ARCH_ARM,
f9d3fb6b
XW
50#elif defined(__loongarch_lp64)
51 SCMP_ARCH_LOONGARCH64,
f2d9751c
LP
52#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
53 SCMP_ARCH_MIPSEL,
54 SCMP_ARCH_MIPS, /* native */
55#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 56 SCMP_ARCH_MIPS,
f2d9751c
LP
57 SCMP_ARCH_MIPSEL, /* native */
58#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS,
61 SCMP_ARCH_MIPSEL64N32,
469830d1 62 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
63 SCMP_ARCH_MIPSEL64,
64 SCMP_ARCH_MIPS64, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL,
f2d9751c
LP
68 SCMP_ARCH_MIPS64N32,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64,
71 SCMP_ARCH_MIPSEL64, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS,
469830d1 75 SCMP_ARCH_MIPSEL64,
f2d9751c 76 SCMP_ARCH_MIPS64,
469830d1 77 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
78 SCMP_ARCH_MIPS64N32, /* native */
79#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPS,
81 SCMP_ARCH_MIPSEL,
82 SCMP_ARCH_MIPS64,
83 SCMP_ARCH_MIPSEL64,
84 SCMP_ARCH_MIPS64N32,
85 SCMP_ARCH_MIPSEL64N32, /* native */
344e6b62
SJ
86#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
87 SCMP_ARCH_PARISC,
88 SCMP_ARCH_PARISC64, /* native */
89#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
90 SCMP_ARCH_PARISC,
f2d9751c 91#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 92 SCMP_ARCH_PPC,
469830d1 93 SCMP_ARCH_PPC64LE,
f2d9751c
LP
94 SCMP_ARCH_PPC64, /* native */
95#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
96 SCMP_ARCH_PPC,
97 SCMP_ARCH_PPC64,
98 SCMP_ARCH_PPC64LE, /* native */
99#elif defined(__powerpc__)
100 SCMP_ARCH_PPC,
f9252236
AJ
101#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
102 SCMP_ARCH_RISCV64,
f2d9751c
LP
103#elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106#elif defined(__s390__)
469830d1 107 SCMP_ARCH_S390,
469830d1 108#endif
65976868 109 SECCOMP_LOCAL_ARCH_END
469830d1 110 };
57183d11
LP
111
112const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
57183d11 117
79893116 118 switch (c) {
aa34055f 119 case SCMP_ARCH_NATIVE:
57183d11 120 return "native";
aa34055f 121 case SCMP_ARCH_X86:
57183d11 122 return "x86";
aa34055f 123 case SCMP_ARCH_X86_64:
57183d11 124 return "x86-64";
aa34055f 125 case SCMP_ARCH_X32:
57183d11 126 return "x32";
aa34055f 127 case SCMP_ARCH_ARM:
57183d11 128 return "arm";
aa34055f
ZJS
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
f9d3fb6b
XW
131#ifdef SCMP_ARCH_LOONGARCH64
132 case SCMP_ARCH_LOONGARCH64:
133 return "loongarch64";
134#endif
aa34055f
ZJS
135 case SCMP_ARCH_MIPS:
136 return "mips";
137 case SCMP_ARCH_MIPS64:
138 return "mips64";
139 case SCMP_ARCH_MIPS64N32:
140 return "mips64-n32";
141 case SCMP_ARCH_MIPSEL:
142 return "mips-le";
143 case SCMP_ARCH_MIPSEL64:
144 return "mips64-le";
145 case SCMP_ARCH_MIPSEL64N32:
146 return "mips64-le-n32";
344e6b62
SJ
147#ifdef SCMP_ARCH_PARISC
148 case SCMP_ARCH_PARISC:
149 return "parisc";
150#endif
151#ifdef SCMP_ARCH_PARISC64
152 case SCMP_ARCH_PARISC64:
153 return "parisc64";
154#endif
aa34055f
ZJS
155 case SCMP_ARCH_PPC:
156 return "ppc";
157 case SCMP_ARCH_PPC64:
158 return "ppc64";
159 case SCMP_ARCH_PPC64LE:
160 return "ppc64-le";
f9252236
AJ
161#ifdef SCMP_ARCH_RISCV64
162 case SCMP_ARCH_RISCV64:
163 return "riscv64";
164#endif
aa34055f 165 case SCMP_ARCH_S390:
6abfd303 166 return "s390";
aa34055f 167 case SCMP_ARCH_S390X:
6abfd303 168 return "s390x";
aa34055f
ZJS
169 default:
170 return NULL;
171 }
57183d11
LP
172}
173
174int seccomp_arch_from_string(const char *n, uint32_t *ret) {
175 if (!n)
176 return -EINVAL;
177
178 assert(ret);
179
180 if (streq(n, "native"))
181 *ret = SCMP_ARCH_NATIVE;
182 else if (streq(n, "x86"))
183 *ret = SCMP_ARCH_X86;
184 else if (streq(n, "x86-64"))
185 *ret = SCMP_ARCH_X86_64;
186 else if (streq(n, "x32"))
187 *ret = SCMP_ARCH_X32;
188 else if (streq(n, "arm"))
189 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
190 else if (streq(n, "arm64"))
191 *ret = SCMP_ARCH_AARCH64;
f9d3fb6b
XW
192#ifdef SCMP_ARCH_LOONGARCH64
193 else if (streq(n, "loongarch64"))
194 *ret = SCMP_ARCH_LOONGARCH64;
195#endif
aa34055f
ZJS
196 else if (streq(n, "mips"))
197 *ret = SCMP_ARCH_MIPS;
198 else if (streq(n, "mips64"))
199 *ret = SCMP_ARCH_MIPS64;
200 else if (streq(n, "mips64-n32"))
201 *ret = SCMP_ARCH_MIPS64N32;
202 else if (streq(n, "mips-le"))
203 *ret = SCMP_ARCH_MIPSEL;
204 else if (streq(n, "mips64-le"))
205 *ret = SCMP_ARCH_MIPSEL64;
206 else if (streq(n, "mips64-le-n32"))
207 *ret = SCMP_ARCH_MIPSEL64N32;
344e6b62
SJ
208#ifdef SCMP_ARCH_PARISC
209 else if (streq(n, "parisc"))
210 *ret = SCMP_ARCH_PARISC;
211#endif
212#ifdef SCMP_ARCH_PARISC64
213 else if (streq(n, "parisc64"))
214 *ret = SCMP_ARCH_PARISC64;
215#endif
aa34055f
ZJS
216 else if (streq(n, "ppc"))
217 *ret = SCMP_ARCH_PPC;
218 else if (streq(n, "ppc64"))
219 *ret = SCMP_ARCH_PPC64;
220 else if (streq(n, "ppc64-le"))
221 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
222#ifdef SCMP_ARCH_RISCV64
223 else if (streq(n, "riscv64"))
224 *ret = SCMP_ARCH_RISCV64;
225#endif
6abfd303
HB
226 else if (streq(n, "s390"))
227 *ret = SCMP_ARCH_S390;
228 else if (streq(n, "s390x"))
229 *ret = SCMP_ARCH_S390X;
57183d11
LP
230 else
231 return -EINVAL;
232
233 return 0;
234}
e9642be2 235
469830d1 236int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 237 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
238 int r;
239
469830d1
LP
240 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
241 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
242
243 seccomp = seccomp_init(default_action);
244 if (!seccomp)
245 return -ENOMEM;
246
469830d1
LP
247 if (arch != SCMP_ARCH_NATIVE &&
248 arch != seccomp_arch_native()) {
249
1b52793d 250 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 251 if (r < 0)
b4eaa6cc 252 return r;
469830d1 253
1b52793d 254 r = seccomp_arch_add(seccomp, arch);
469830d1 255 if (r < 0)
b4eaa6cc 256 return r;
469830d1
LP
257
258 assert(seccomp_arch_exist(seccomp, arch) >= 0);
259 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
260 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
261 } else {
262 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
263 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
264 }
265
266 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 267 if (r < 0)
b4eaa6cc 268 return r;
8d7b0c8f
LP
269
270 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
271 if (r < 0)
b4eaa6cc 272 return r;
8d7b0c8f 273
44aaddad
SD
274#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
275 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
276 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
277 if (r < 0)
278 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
279 }
280#endif
281
b4eaa6cc 282 *ret = TAKE_PTR(seccomp);
8d7b0c8f 283 return 0;
8d7b0c8f
LP
284}
285
/* Probes for seccomp support by issuing prctl(PR_GET_SECCOMP): any non-negative result counts as
 * "available". */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
289
/* Probes for seccomp filter mode by requesting SECCOMP_MODE_FILTER with a NULL filter pointer. The
 * probe counts as positive only if the call fails and errno is EFAULT; a successful call or any other
 * error is reported as "not available". */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
294
bool is_seccomp_available(void) {
        /* Tri-state cache: negative means "not determined yet", otherwise it holds the boolean
         * answer computed by the first call. */
        static int cache = -1;
        int b;

        if (cache >= 0)
                return cache;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* $SYSTEMD_SECCOMP parsed as false: report seccomp as unavailable without probing. */
                cache = false;
                return cache;
        }

        /* A value that could not be parsed is logged and otherwise treated like an unset variable. */
        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        /* Both probes have to succeed; the second one is only run if the first one did. */
        cache = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cache;
}
315
8130926d 316const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 317 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 318 .name = "@default",
d5efc18b 319 .help = "System calls that are always permitted",
40eb6a80 320 .value =
5f02870a 321 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
5abede32 322 "brk\0"
8e24b1d2 323 "cacheflush\0"
40eb6a80 324 "clock_getres\0"
6ca67710 325 "clock_getres_time64\0"
40eb6a80 326 "clock_gettime\0"
6ca67710 327 "clock_gettime64\0"
40eb6a80 328 "clock_nanosleep\0"
6ca67710 329 "clock_nanosleep_time64\0"
40eb6a80
ZJS
330 "execve\0"
331 "exit\0"
332 "exit_group\0"
e41b0f42 333 "futex\0"
6ca67710 334 "futex_time64\0"
76e86b8d 335 "futex_waitv\0"
e41b0f42
LP
336 "get_robust_list\0"
337 "get_thread_area\0"
09d3020b
DH
338 "getegid\0"
339 "getegid32\0"
340 "geteuid\0"
341 "geteuid32\0"
342 "getgid\0"
343 "getgid32\0"
344 "getgroups\0"
345 "getgroups32\0"
346 "getpgid\0"
347 "getpgrp\0"
348 "getpid\0"
349 "getppid\0"
14f4b1b5 350 "getrandom\0"
09d3020b
DH
351 "getresgid\0"
352 "getresgid32\0"
353 "getresuid\0"
354 "getresuid32\0"
40eb6a80 355 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
356 "getsid\0"
357 "gettid\0"
40eb6a80 358 "gettimeofday\0"
09d3020b
DH
359 "getuid\0"
360 "getuid32\0"
e41b0f42 361 "membarrier\0"
5abede32
LP
362 "mmap\0"
363 "mmap2\0"
47286254 364 "mprotect\0"
11b9105d 365 "munmap\0"
40eb6a80
ZJS
366 "nanosleep\0"
367 "pause\0"
4c3a9176 368 "prlimit64\0"
e41b0f42 369 "restart_syscall\0"
09925036 370 "riscv_flush_icache\0"
ca15fc48 371 "riscv_hwprobe\0"
6fee3be0 372 "rseq\0"
40eb6a80 373 "rt_sigreturn\0"
7df660e4 374 "sched_getaffinity\0"
8f44de08 375 "sched_yield\0"
e41b0f42
LP
376 "set_robust_list\0"
377 "set_thread_area\0"
378 "set_tid_address\0"
ce5faeac 379 "set_tls\0"
40eb6a80
ZJS
380 "sigreturn\0"
381 "time\0"
4c3a9176 382 "ugetrlimit\0"
40eb6a80 383 },
44898c53
LP
384 [SYSCALL_FILTER_SET_AIO] = {
385 .name = "@aio",
386 .help = "Asynchronous IO",
387 .value =
388 "io_cancel\0"
389 "io_destroy\0"
390 "io_getevents\0"
a05cfe23 391 "io_pgetevents\0"
6ca67710 392 "io_pgetevents_time64\0"
44898c53
LP
393 "io_setup\0"
394 "io_submit\0"
9e486265
LP
395 "io_uring_enter\0"
396 "io_uring_register\0"
397 "io_uring_setup\0"
44898c53 398 },
133ddbbe 399 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 400 .name = "@basic-io",
d5efc18b 401 .help = "Basic IO",
133ddbbe 402 .value =
648a0ed0 403 "_llseek\0"
133ddbbe 404 "close\0"
6ea0d25c 405 "close_range\0"
648a0ed0 406 "dup\0"
133ddbbe
LP
407 "dup2\0"
408 "dup3\0"
133ddbbe
LP
409 "lseek\0"
410 "pread64\0"
411 "preadv\0"
44898c53 412 "preadv2\0"
133ddbbe
LP
413 "pwrite64\0"
414 "pwritev\0"
44898c53 415 "pwritev2\0"
133ddbbe
LP
416 "read\0"
417 "readv\0"
418 "write\0"
419 "writev\0"
420 },
44898c53
LP
421 [SYSCALL_FILTER_SET_CHOWN] = {
422 .name = "@chown",
423 .help = "Change ownership of files and directories",
424 .value =
425 "chown\0"
426 "chown32\0"
427 "fchown\0"
428 "fchown32\0"
429 "fchownat\0"
430 "lchown\0"
431 "lchown32\0"
432 },
8130926d 433 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 434 .name = "@clock",
d5efc18b 435 .help = "Change the system time",
201c1cc2
TM
436 .value =
437 "adjtimex\0"
1f9ac68b 438 "clock_adjtime\0"
6ca67710 439 "clock_adjtime64\0"
1f9ac68b 440 "clock_settime\0"
6ca67710 441 "clock_settime64\0"
201c1cc2 442 "settimeofday\0"
8130926d
LP
443 },
444 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 445 .name = "@cpu-emulation",
d5efc18b 446 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
447 .value =
448 "modify_ldt\0"
449 "subpage_prot\0"
450 "switch_endian\0"
451 "vm86\0"
452 "vm86old\0"
8130926d
LP
453 },
454 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 455 .name = "@debug",
d5efc18b 456 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
457 .value =
458 "lookup_dcookie\0"
459 "perf_event_open\0"
8270e3d8 460 "pidfd_getfd\0"
1f9ac68b
LP
461 "ptrace\0"
462 "rtas\0"
463 "s390_runtime_instr\0"
464 "sys_debug_setcontext\0"
8130926d 465 },
1a1b13c9
LP
466 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
467 .name = "@file-system",
468 .help = "File system operations",
469 .value =
470 "access\0"
471 "chdir\0"
472 "chmod\0"
473 "close\0"
474 "creat\0"
475 "faccessat\0"
bcf08acb 476 "faccessat2\0"
1a1b13c9
LP
477 "fallocate\0"
478 "fchdir\0"
479 "fchmod\0"
480 "fchmodat\0"
1a1b13c9 481 "fcntl\0"
ceaa6aa7 482 "fcntl64\0"
1a1b13c9
LP
483 "fgetxattr\0"
484 "flistxattr\0"
ceaa6aa7 485 "fremovexattr\0"
1a1b13c9 486 "fsetxattr\0"
1a1b13c9 487 "fstat\0"
ceaa6aa7 488 "fstat64\0"
1a1b13c9 489 "fstatat64\0"
1a1b13c9 490 "fstatfs\0"
ceaa6aa7 491 "fstatfs64\0"
1a1b13c9 492 "ftruncate\0"
ceaa6aa7 493 "ftruncate64\0"
1a1b13c9
LP
494 "futimesat\0"
495 "getcwd\0"
1a1b13c9 496 "getdents\0"
ceaa6aa7 497 "getdents64\0"
1a1b13c9
LP
498 "getxattr\0"
499 "inotify_add_watch\0"
ceaa6aa7 500 "inotify_init\0"
1a1b13c9
LP
501 "inotify_init1\0"
502 "inotify_rm_watch\0"
503 "lgetxattr\0"
504 "link\0"
505 "linkat\0"
506 "listxattr\0"
507 "llistxattr\0"
508 "lremovexattr\0"
509 "lsetxattr\0"
1a1b13c9 510 "lstat\0"
ceaa6aa7 511 "lstat64\0"
1a1b13c9
LP
512 "mkdir\0"
513 "mkdirat\0"
514 "mknod\0"
515 "mknodat\0"
1a1b13c9 516 "newfstatat\0"
ceaa6aa7
LP
517 "oldfstat\0"
518 "oldlstat\0"
519 "oldstat\0"
1a1b13c9
LP
520 "open\0"
521 "openat\0"
8270e3d8 522 "openat2\0"
1a1b13c9
LP
523 "readlink\0"
524 "readlinkat\0"
525 "removexattr\0"
526 "rename\0"
1a1b13c9 527 "renameat\0"
ceaa6aa7 528 "renameat2\0"
1a1b13c9
LP
529 "rmdir\0"
530 "setxattr\0"
1a1b13c9 531 "stat\0"
ceaa6aa7 532 "stat64\0"
1a1b13c9 533 "statfs\0"
ceaa6aa7 534 "statfs64\0"
a4135a74 535 "statx\0"
1a1b13c9
LP
536 "symlink\0"
537 "symlinkat\0"
1a1b13c9 538 "truncate\0"
ceaa6aa7 539 "truncate64\0"
1a1b13c9
LP
540 "unlink\0"
541 "unlinkat\0"
ceaa6aa7 542 "utime\0"
1a1b13c9 543 "utimensat\0"
6ca67710 544 "utimensat_time64\0"
1a1b13c9
LP
545 "utimes\0"
546 },
8130926d 547 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 548 .name = "@io-event",
d5efc18b 549 .help = "Event loop system calls",
201c1cc2
TM
550 .value =
551 "_newselect\0"
201c1cc2 552 "epoll_create\0"
215728ff 553 "epoll_create1\0"
201c1cc2
TM
554 "epoll_ctl\0"
555 "epoll_ctl_old\0"
556 "epoll_pwait\0"
34254e59 557 "epoll_pwait2\0"
201c1cc2
TM
558 "epoll_wait\0"
559 "epoll_wait_old\0"
201c1cc2 560 "eventfd\0"
215728ff 561 "eventfd2\0"
201c1cc2
TM
562 "poll\0"
563 "ppoll\0"
6ca67710 564 "ppoll_time64\0"
201c1cc2 565 "pselect6\0"
6ca67710 566 "pselect6_time64\0"
201c1cc2 567 "select\0"
8130926d
LP
568 },
569 [SYSCALL_FILTER_SET_IPC] = {
8130926d 570 .name = "@ipc",
d5efc18b
ZJS
571 .help = "SysV IPC, POSIX Message Queues or other IPC",
572 .value =
573 "ipc\0"
cd5bfd7e 574 "memfd_create\0"
201c1cc2
TM
575 "mq_getsetattr\0"
576 "mq_notify\0"
577 "mq_open\0"
578 "mq_timedreceive\0"
6ca67710 579 "mq_timedreceive_time64\0"
201c1cc2 580 "mq_timedsend\0"
6ca67710 581 "mq_timedsend_time64\0"
201c1cc2
TM
582 "mq_unlink\0"
583 "msgctl\0"
584 "msgget\0"
585 "msgrcv\0"
586 "msgsnd\0"
cd5bfd7e 587 "pipe\0"
215728ff 588 "pipe2\0"
34254e59 589 "process_madvise\0"
201c1cc2
TM
590 "process_vm_readv\0"
591 "process_vm_writev\0"
592 "semctl\0"
593 "semget\0"
594 "semop\0"
595 "semtimedop\0"
6ca67710 596 "semtimedop_time64\0"
201c1cc2
TM
597 "shmat\0"
598 "shmctl\0"
599 "shmdt\0"
600 "shmget\0"
8130926d
LP
601 },
602 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 603 .name = "@keyring",
d5efc18b 604 .help = "Kernel keyring access",
1f9ac68b
LP
605 .value =
606 "add_key\0"
607 "keyctl\0"
608 "request_key\0"
8130926d 609 },
cd0ddf6f
LP
610 [SYSCALL_FILTER_SET_MEMLOCK] = {
611 .name = "@memlock",
612 .help = "Memory locking control",
613 .value =
614 "mlock\0"
615 "mlock2\0"
616 "mlockall\0"
617 "munlock\0"
618 "munlockall\0"
619 },
8130926d 620 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 621 .name = "@module",
d5efc18b 622 .help = "Loading and unloading of kernel modules",
201c1cc2 623 .value =
201c1cc2
TM
624 "delete_module\0"
625 "finit_module\0"
626 "init_module\0"
8130926d
LP
627 },
628 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 629 .name = "@mount",
d5efc18b 630 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
631 .value =
632 "chroot\0"
9e486265
LP
633 "fsconfig\0"
634 "fsmount\0"
635 "fsopen\0"
636 "fspick\0"
201c1cc2 637 "mount\0"
34254e59 638 "mount_setattr\0"
9e486265
LP
639 "move_mount\0"
640 "open_tree\0"
201c1cc2 641 "pivot_root\0"
201c1cc2 642 "umount\0"
215728ff 643 "umount2\0"
8130926d
LP
644 },
645 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 646 .name = "@network-io",
d5efc18b 647 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 648 .value =
201c1cc2 649 "accept\0"
215728ff 650 "accept4\0"
201c1cc2
TM
651 "bind\0"
652 "connect\0"
653 "getpeername\0"
654 "getsockname\0"
655 "getsockopt\0"
656 "listen\0"
657 "recv\0"
658 "recvfrom\0"
659 "recvmmsg\0"
6ca67710 660 "recvmmsg_time64\0"
201c1cc2
TM
661 "recvmsg\0"
662 "send\0"
663 "sendmmsg\0"
664 "sendmsg\0"
665 "sendto\0"
666 "setsockopt\0"
667 "shutdown\0"
668 "socket\0"
669 "socketcall\0"
670 "socketpair\0"
8130926d
LP
671 },
672 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 673 /* some unknown even to libseccomp */
8130926d 674 .name = "@obsolete",
d5efc18b 675 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
676 .value =
677 "_sysctl\0"
678 "afs_syscall\0"
802fa07a 679 "bdflush\0"
201c1cc2 680 "break\0"
1f9ac68b 681 "create_module\0"
201c1cc2
TM
682 "ftime\0"
683 "get_kernel_syms\0"
201c1cc2
TM
684 "getpmsg\0"
685 "gtty\0"
7e0c3b8f 686 "idle\0"
201c1cc2 687 "lock\0"
201c1cc2 688 "mpx\0"
201c1cc2
TM
689 "prof\0"
690 "profil\0"
201c1cc2
TM
691 "putpmsg\0"
692 "query_module\0"
201c1cc2
TM
693 "security\0"
694 "sgetmask\0"
695 "ssetmask\0"
ae5e9bf4 696 "stime\0"
201c1cc2 697 "stty\0"
1f9ac68b 698 "sysfs\0"
201c1cc2
TM
699 "tuxcall\0"
700 "ulimit\0"
701 "uselib\0"
1f9ac68b 702 "ustat\0"
201c1cc2 703 "vserver\0"
8130926d 704 },
9493b168
ZJS
705 [SYSCALL_FILTER_SET_PKEY] = {
706 .name = "@pkey",
707 .help = "System calls used for memory protection keys",
708 .value =
709 "pkey_alloc\0"
710 "pkey_free\0"
711 "pkey_mprotect\0"
712 },
8130926d 713 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 714 .name = "@privileged",
d5efc18b 715 .help = "All system calls which need super-user capabilities",
201c1cc2 716 .value =
44898c53 717 "@chown\0"
201c1cc2
TM
718 "@clock\0"
719 "@module\0"
720 "@raw-io\0"
af0f047b
LP
721 "@reboot\0"
722 "@swap\0"
215728ff 723 "_sysctl\0"
201c1cc2 724 "acct\0"
201c1cc2 725 "bpf\0"
1f9ac68b 726 "capset\0"
201c1cc2 727 "chroot\0"
a05cfe23 728 "fanotify_init\0"
9e486265 729 "fanotify_mark\0"
201c1cc2 730 "nfsservctl\0"
a05cfe23 731 "open_by_handle_at\0"
201c1cc2
TM
732 "pivot_root\0"
733 "quotactl\0"
76e86b8d 734 "quotactl_fd\0"
201c1cc2 735 "setdomainname\0"
201c1cc2 736 "setfsuid\0"
215728ff 737 "setfsuid32\0"
201c1cc2 738 "setgroups\0"
215728ff 739 "setgroups32\0"
201c1cc2 740 "sethostname\0"
201c1cc2 741 "setresuid\0"
215728ff 742 "setresuid32\0"
201c1cc2 743 "setreuid\0"
215728ff 744 "setreuid32\0"
e05ee49b 745 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 746 "setuid32\0"
201c1cc2 747 "vhangup\0"
8130926d
LP
748 },
749 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 750 .name = "@process",
7b121df6 751 .help = "Process control, execution, namespacing operations",
201c1cc2 752 .value =
09d3020b 753 "capget\0" /* Able to query arbitrary processes */
201c1cc2 754 "clone\0"
c5503601
ZJS
755 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
756 * implement seccomp, so we don't need to list it at all. C.f.
757 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
9e486265 758 "clone3\0"
201c1cc2
TM
759 "execveat\0"
760 "fork\0"
b887d2eb 761 "getrusage\0"
201c1cc2 762 "kill\0"
9e486265 763 "pidfd_open\0"
46fcf95d 764 "pidfd_send_signal\0"
201c1cc2 765 "prctl\0"
b887d2eb
LP
766 "rt_sigqueueinfo\0"
767 "rt_tgsigqueueinfo\0"
201c1cc2 768 "setns\0"
a9518dc3 769 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 770 "tgkill\0"
b887d2eb 771 "times\0"
201c1cc2
TM
772 "tkill\0"
773 "unshare\0"
774 "vfork\0"
b887d2eb
LP
775 "wait4\0"
776 "waitid\0"
777 "waitpid\0"
8130926d
LP
778 },
779 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 780 .name = "@raw-io",
d5efc18b 781 .help = "Raw I/O port access",
201c1cc2
TM
782 .value =
783 "ioperm\0"
784 "iopl\0"
1f9ac68b 785 "pciconfig_iobase\0"
201c1cc2
TM
786 "pciconfig_read\0"
787 "pciconfig_write\0"
788 "s390_pci_mmio_read\0"
789 "s390_pci_mmio_write\0"
8130926d 790 },
bd2ab3f4
LP
791 [SYSCALL_FILTER_SET_REBOOT] = {
792 .name = "@reboot",
793 .help = "Reboot and reboot preparation/kexec",
794 .value =
bd2ab3f4 795 "kexec_file_load\0"
e59608fa 796 "kexec_load\0"
bd2ab3f4
LP
797 "reboot\0"
798 },
133ddbbe 799 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 800 .name = "@resources",
58a8f68b 801 .help = "Alter resource settings",
133ddbbe 802 .value =
0963c053
LP
803 "ioprio_set\0"
804 "mbind\0"
805 "migrate_pages\0"
806 "move_pages\0"
807 "nice\0"
0963c053
LP
808 "sched_setaffinity\0"
809 "sched_setattr\0"
133ddbbe
LP
810 "sched_setparam\0"
811 "sched_setscheduler\0"
0963c053 812 "set_mempolicy\0"
76e86b8d 813 "set_mempolicy_home_node\0"
133ddbbe
LP
814 "setpriority\0"
815 "setrlimit\0"
133ddbbe 816 },
d12632a8
LP
817 [SYSCALL_FILTER_SET_SANDBOX] = {
818 .name = "@sandbox",
819 .help = "Sandbox functionality",
820 .value =
821 "landlock_add_rule\0"
822 "landlock_create_ruleset\0"
823 "landlock_restrict_self\0"
824 "seccomp\0"
825 },
6eaaeee9
LP
826 [SYSCALL_FILTER_SET_SETUID] = {
827 .name = "@setuid",
828 .help = "Operations for changing user/group credentials",
829 .value =
6eaaeee9 830 "setgid\0"
215728ff 831 "setgid32\0"
6eaaeee9 832 "setgroups\0"
215728ff 833 "setgroups32\0"
6eaaeee9 834 "setregid\0"
215728ff 835 "setregid32\0"
6eaaeee9 836 "setresgid\0"
215728ff 837 "setresgid32\0"
6eaaeee9 838 "setresuid\0"
215728ff 839 "setresuid32\0"
6eaaeee9 840 "setreuid\0"
215728ff 841 "setreuid32\0"
6eaaeee9 842 "setuid\0"
215728ff 843 "setuid32\0"
6eaaeee9 844 },
cd0ddf6f
LP
845 [SYSCALL_FILTER_SET_SIGNAL] = {
846 .name = "@signal",
847 .help = "Process signal handling",
848 .value =
849 "rt_sigaction\0"
850 "rt_sigpending\0"
851 "rt_sigprocmask\0"
852 "rt_sigsuspend\0"
853 "rt_sigtimedwait\0"
6ca67710 854 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
855 "sigaction\0"
856 "sigaltstack\0"
857 "signal\0"
858 "signalfd\0"
859 "signalfd4\0"
860 "sigpending\0"
861 "sigprocmask\0"
862 "sigsuspend\0"
863 },
bd2ab3f4
LP
864 [SYSCALL_FILTER_SET_SWAP] = {
865 .name = "@swap",
866 .help = "Enable/disable swap devices",
867 .value =
868 "swapoff\0"
869 "swapon\0"
870 },
44898c53
LP
871 [SYSCALL_FILTER_SET_SYNC] = {
872 .name = "@sync",
873 .help = "Synchronize files and memory to storage",
874 .value =
875 "fdatasync\0"
876 "fsync\0"
877 "msync\0"
878 "sync\0"
879 "sync_file_range\0"
a8fb09f5 880 "sync_file_range2\0"
44898c53
LP
881 "syncfs\0"
882 },
70526841
LP
883 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
884 .name = "@system-service",
885 .help = "General system service operations",
886 .value =
887 "@aio\0"
888 "@basic-io\0"
889 "@chown\0"
890 "@default\0"
891 "@file-system\0"
892 "@io-event\0"
893 "@ipc\0"
894 "@keyring\0"
895 "@memlock\0"
896 "@network-io\0"
897 "@process\0"
898 "@resources\0"
899 "@setuid\0"
900 "@signal\0"
901 "@sync\0"
902 "@timer\0"
26b682e8 903 "arm_fadvise64_64\0"
70526841
LP
904 "capget\0"
905 "capset\0"
906 "copy_file_range\0"
907 "fadvise64\0"
908 "fadvise64_64\0"
909 "flock\0"
910 "get_mempolicy\0"
911 "getcpu\0"
912 "getpriority\0"
70526841
LP
913 "ioctl\0"
914 "ioprio_get\0"
915 "kcmp\0"
916 "madvise\0"
70526841
LP
917 "mremap\0"
918 "name_to_handle_at\0"
919 "oldolduname\0"
920 "olduname\0"
921 "personality\0"
922 "readahead\0"
923 "readdir\0"
924 "remap_file_pages\0"
925 "sched_get_priority_max\0"
926 "sched_get_priority_min\0"
70526841
LP
927 "sched_getattr\0"
928 "sched_getparam\0"
929 "sched_getscheduler\0"
930 "sched_rr_get_interval\0"
6ca67710 931 "sched_rr_get_interval_time64\0"
70526841
LP
932 "sched_yield\0"
933 "sendfile\0"
934 "sendfile64\0"
935 "setfsgid\0"
936 "setfsgid32\0"
937 "setfsuid\0"
938 "setfsuid32\0"
939 "setpgid\0"
940 "setsid\0"
941 "splice\0"
942 "sysinfo\0"
943 "tee\0"
944 "umask\0"
945 "uname\0"
946 "userfaultfd\0"
947 "vmsplice\0"
948 },
cd0ddf6f
LP
949 [SYSCALL_FILTER_SET_TIMER] = {
950 .name = "@timer",
951 .help = "Schedule operations by time",
952 .value =
953 "alarm\0"
954 "getitimer\0"
955 "setitimer\0"
956 "timer_create\0"
957 "timer_delete\0"
958 "timer_getoverrun\0"
959 "timer_gettime\0"
6ca67710 960 "timer_gettime64\0"
cd0ddf6f 961 "timer_settime\0"
6ca67710 962 "timer_settime64\0"
cd0ddf6f
LP
963 "timerfd_create\0"
964 "timerfd_gettime\0"
6ca67710 965 "timerfd_gettime64\0"
cd0ddf6f 966 "timerfd_settime\0"
6ca67710 967 "timerfd_settime64\0"
cd0ddf6f
LP
968 "times\0"
969 },
95aac012
ZJS
970 [SYSCALL_FILTER_SET_KNOWN] = {
971 .name = "@known",
972 .help = "All known syscalls declared in the kernel",
973 .value =
6d6a0854 974 "@obsolete\0"
95aac012
ZJS
975#include "syscall-list.h"
976 },
201c1cc2 977};
8130926d
LP
978
979const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
980 if (isempty(name) || name[0] != '@')
981 return NULL;
982
077e8fc0 983 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
984 if (streq(syscall_filter_sets[i].name, name))
985 return syscall_filter_sets + i;
986
987 return NULL;
988}
989
000c0520
ZJS
990static int add_syscall_filter_set(
991 scmp_filter_ctx seccomp,
992 const SyscallFilterSet *set,
993 uint32_t action,
994 char **exclude,
995 bool log_missing,
996 char ***added);
997
998int seccomp_add_syscall_filter_item(
999 scmp_filter_ctx *seccomp,
1000 const char *name,
1001 uint32_t action,
1002 char **exclude,
1003 bool log_missing,
1004 char ***added) {
69b1b241
LP
1005
1006 assert(seccomp);
1007 assert(name);
1008
960e4569
LP
1009 if (strv_contains(exclude, name))
1010 return 0;
1011
000c0520
ZJS
1012 /* Any syscalls that are handled are added to the *added strv. The pointer
1013 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1014
69b1b241
LP
1015 if (name[0] == '@') {
1016 const SyscallFilterSet *other;
1017
1018 other = syscall_filter_set_find(name);
baaa35ad
ZJS
1019 if (!other)
1020 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
1021 "Filter set %s is not known!",
1022 name);
69b1b241 1023
000c0520 1024 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 1025
69b1b241 1026 } else {
b54f36c6 1027 int id, r;
69b1b241
LP
1028
1029 id = seccomp_syscall_resolve_name(name);
cff7bff8 1030 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
1031 if (log_missing)
1032 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 1033 return 0;
cff7bff8 1034 }
69b1b241
LP
1035
1036 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 1037 if (r < 0) {
69b1b241 1038 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
1039 bool ignore = r == -EDOM;
1040
1041 if (!ignore || log_missing)
1042 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1043 name, id, ignore ? ", ignoring" : "");
1044 if (!ignore)
1045 return r;
b54f36c6 1046 }
69b1b241 1047
000c0520
ZJS
1048 if (added) {
1049 r = strv_extend(added, name);
1050 if (r < 0)
1051 return r;
1052 }
1053
b54f36c6
ZJS
1054 return 0;
1055 }
69b1b241
LP
1056}
1057
000c0520 1058static int add_syscall_filter_set(
469830d1 1059 scmp_filter_ctx seccomp,
469830d1 1060 const SyscallFilterSet *set,
960e4569 1061 uint32_t action,
b54f36c6 1062 char **exclude,
000c0520
ZJS
1063 bool log_missing,
1064 char ***added) {
469830d1 1065
8130926d
LP
1066 int r;
1067
000c0520
ZJS
1068 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1069
8130926d
LP
1070 assert(seccomp);
1071 assert(set);
1072
1073 NULSTR_FOREACH(sys, set->value) {
000c0520 1074 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1075 if (r < 0)
1076 return r;
469830d1
LP
1077 }
1078
1079 return 0;
1080}
1081
b54f36c6 1082int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
1083 uint32_t arch;
1084 int r;
1085
1086 assert(set);
1087
1088 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1089 * each local arch. */
469830d1
LP
1090
1091 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1092 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1093
30868c1c 1094 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1095
1096 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
1097 if (r < 0)
1098 return r;
469830d1 1099
000c0520 1100 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
7e86bd73
ZJS
1101 if (r < 0)
1102 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
1103
1104 r = seccomp_load(seccomp);
3c098014
ZJS
1105 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1106 return r;
1107 if (r < 0)
1108 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m",
1109 seccomp_arch_to_string(arch));
8130926d
LP
1110 }
1111
1112 return 0;
1113}
a3be2849 1114
1862b310 1115int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
469830d1 1116 uint32_t arch;
a3be2849
LP
1117 int r;
1118
1862b310
YW
1119 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1120 * of a SyscallFilterSet* table. */
a3be2849 1121
1862b310 1122 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
469830d1 1123 return 0;
a3be2849 1124
469830d1
LP
1125 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1126 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1127 void *syscall_id, *val;
a3be2849 1128
30868c1c 1129 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1130
469830d1
LP
1131 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1132 if (r < 0)
1133 return r;
a3be2849 1134
1862b310 1135 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
8cfa775f 1136 uint32_t a = action;
b54f36c6
ZJS
1137 int id = PTR_TO_INT(syscall_id) - 1;
1138 int error = PTR_TO_INT(val);
8cfa775f 1139
005bfaf1
TM
1140 if (error == SECCOMP_ERROR_NUMBER_KILL)
1141 a = scmp_act_kill_process();
9df2cdd8
TM
1142#ifdef SCMP_ACT_LOG
1143 else if (action == SCMP_ACT_LOG)
1144 a = SCMP_ACT_LOG;
1145#endif
68acc1af 1146 else if (error >= 0)
b54f36c6 1147 a = SCMP_ACT_ERRNO(error);
8cfa775f 1148
b54f36c6 1149 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1 1150 if (r < 0) {
1862b310
YW
1151 /* If the system call is not known on this architecture, then that's
1152 * fine, let's ignore it */
469830d1 1153 _cleanup_free_ char *n = NULL;
7e86bd73 1154 bool ignore;
469830d1 1155
b54f36c6 1156 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1157 ignore = r == -EDOM;
1158 if (!ignore || log_missing)
1159 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1160 strna(n), id, ignore ? ", ignoring" : "");
1161 if (!ignore)
1162 return r;
469830d1
LP
1163 }
1164 }
1165
1166 r = seccomp_load(seccomp);
3c098014
ZJS
1167 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1168 return r;
1169 if (r < 0)
a52765a5 1170 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1862b310 1171 seccomp_arch_to_string(arch));
469830d1
LP
1172 }
1173
1174 return 0;
add00535
LP
1175}
1176
58f6ab44 1177int seccomp_parse_syscall_filter(
898748d8
YW
1178 const char *name,
1179 int errno_num,
1180 Hashmap *filter,
13d92c63 1181 SeccompParseFlags flags,
898748d8
YW
1182 const char *unit,
1183 const char *filename,
1184 unsigned line) {
1185
1186 int r;
1187
1188 assert(name);
1189 assert(filter);
1190
084a46d7
YW
1191 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1192 return -EINVAL;
1193
898748d8
YW
1194 if (name[0] == '@') {
1195 const SyscallFilterSet *set;
898748d8
YW
1196
1197 set = syscall_filter_set_find(name);
1198 if (!set) {
9e29ee40 1199 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1200 return -EINVAL;
13d92c63 1201
9e29ee40 1202 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1203 "Unknown system call group, ignoring: %s", name);
1204 return 0;
898748d8
YW
1205 }
1206
1207 NULSTR_FOREACH(i, set->value) {
3c098014
ZJS
1208 /* Call ourselves again, for the group to parse. Note that we downgrade logging here
1209 * (i.e. take away the SECCOMP_PARSE_LOG flag) since any issues in the group table
1210 * are our own problem, not a problem in user configuration data and we shouldn't
1211 * pretend otherwise by complaining about them. */
58f6ab44 1212 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1213 if (r < 0)
1214 return r;
1215 }
1216 } else {
1217 int id;
1218
1219 id = seccomp_syscall_resolve_name(name);
1220 if (id == __NR_SCMP_ERROR) {
9e29ee40 1221 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1222 return -EINVAL;
13d92c63 1223
9e29ee40 1224 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1225 "Failed to parse system call, ignoring: %s", name);
1226 return 0;
898748d8
YW
1227 }
1228
3c098014
ZJS
1229 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it
1230 * from the list. The entries in allow-list with non-negative error value will be handled
1231 * with SCMP_ACT_ERRNO() instead of the default action. */
68acc1af
YW
1232 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1233 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
898748d8
YW
1234 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1235 if (r < 0)
851ee70a
LW
1236 switch (r) {
1237 case -ENOMEM:
9e29ee40 1238 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
851ee70a 1239 case -EEXIST:
9d7fe7c6
LW
1240 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1241 break;
851ee70a
LW
1242 default:
1243 return r;
1244 }
898748d8
YW
1245 } else
1246 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1247 }
1248
1249 return 0;
1250}
1251
add00535 1252int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1253 uint32_t arch;
add00535
LP
1254 int r;
1255
f1d34068 1256 if (DEBUG_LOGGING) {
add00535
LP
1257 _cleanup_free_ char *s = NULL;
1258
86c2a9f1 1259 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1260 log_debug("Restricting namespace to: %s.", strna(s));
1261 }
1262
1263 /* NOOP? */
d7a0f1f4 1264 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
add00535
LP
1265 return 0;
1266
469830d1
LP
1267 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1268 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1269
30868c1c 1270 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1271
1272 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1273 if (r < 0)
1274 return r;
1275
30193fe8
ZJS
1276 /* We cannot filter on individual flags to clone3(), and we need to disable the
1277 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1278 * users shall fall back to clone(), as if on an older kernel.
1279 *
1280 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1281 * https://github.com/moby/moby/issues/42680. */
1282
1283 r = seccomp_rule_add_exact(
1284 seccomp,
1285 SCMP_ACT_ERRNO(ENOSYS),
1286 SCMP_SYS(clone3),
1287 0);
1288 if (r < 0)
3c098014
ZJS
1289 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m",
1290 seccomp_arch_to_string(arch));
30193fe8 1291
469830d1 1292 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
3c098014
ZJS
1293 /* If every single kind of namespace shall be prohibited, then let's block the whole
1294 * setns() syscall altogether. */
469830d1
LP
1295 r = seccomp_rule_add_exact(
1296 seccomp,
1297 SCMP_ACT_ERRNO(EPERM),
1298 SCMP_SYS(setns),
1299 0);
1300 else
3c098014
ZJS
1301 /* Otherwise, block only the invocations with the appropriate flags in the loop
1302 * below, but also the special invocation with a zero flags argument, right here. */
469830d1
LP
1303 r = seccomp_rule_add_exact(
1304 seccomp,
1305 SCMP_ACT_ERRNO(EPERM),
1306 SCMP_SYS(setns),
1307 1,
1308 SCMP_A1(SCMP_CMP_EQ, 0));
1309 if (r < 0) {
3c098014
ZJS
1310 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1311 seccomp_arch_to_string(arch));
469830d1
LP
1312 continue;
1313 }
1314
241b1577 1315 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
469830d1
LP
1316 unsigned long f;
1317
241b1577 1318 f = namespace_info[i].clone_flag;
d7a0f1f4 1319 if (FLAGS_SET(retain, f)) {
241b1577 1320 log_debug("Permitting %s.", namespace_info[i].proc_name);
469830d1
LP
1321 continue;
1322 }
1323
30868c1c 1324 log_trace("Blocking %s.", namespace_info[i].proc_name);
469830d1
LP
1325
1326 r = seccomp_rule_add_exact(
1327 seccomp,
1328 SCMP_ACT_ERRNO(EPERM),
1329 SCMP_SYS(unshare),
1330 1,
1331 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1332 if (r < 0) {
3c098014
ZJS
1333 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m",
1334 seccomp_arch_to_string(arch));
469830d1
LP
1335 break;
1336 }
1337
511ceb1f
ZJS
1338 /* On s390/s390x the first two parameters to clone are switched */
1339 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1340 r = seccomp_rule_add_exact(
1341 seccomp,
1342 SCMP_ACT_ERRNO(EPERM),
1343 SCMP_SYS(clone),
1344 1,
1345 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1346 else
1347 r = seccomp_rule_add_exact(
1348 seccomp,
1349 SCMP_ACT_ERRNO(EPERM),
1350 SCMP_SYS(clone),
1351 1,
1352 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1 1353 if (r < 0) {
3c098014
ZJS
1354 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m",
1355 seccomp_arch_to_string(arch));
469830d1
LP
1356 break;
1357 }
1358
1359 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1360 r = seccomp_rule_add_exact(
1361 seccomp,
1362 SCMP_ACT_ERRNO(EPERM),
1363 SCMP_SYS(setns),
1364 1,
1365 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1366 if (r < 0) {
3c098014
ZJS
1367 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1368 seccomp_arch_to_string(arch));
469830d1
LP
1369 break;
1370 }
1371 }
1372 }
1373 if (r < 0)
1374 continue;
1375
1376 r = seccomp_load(seccomp);
3c098014
ZJS
1377 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1378 return r;
1379 if (r < 0)
1380 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m",
1381 seccomp_arch_to_string(arch));
469830d1
LP
1382 }
1383
1384 return 0;
1385}
1386
1387int seccomp_protect_sysctl(void) {
1388 uint32_t arch;
1389 int r;
1390
1391 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1392 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1393
30868c1c 1394 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1395
f9252236
AJ
1396 if (IN_SET(arch,
1397 SCMP_ARCH_AARCH64,
f9d3fb6b
XW
1398#ifdef SCMP_ARCH_LOONGARCH64
1399 SCMP_ARCH_LOONGARCH64,
1400#endif
f9252236
AJ
1401#ifdef SCMP_ARCH_RISCV64
1402 SCMP_ARCH_RISCV64,
1403#endif
1404 SCMP_ARCH_X32
1405 ))
2e64e8f4
ZJS
1406 /* No _sysctl syscall */
1407 continue;
1408
469830d1
LP
1409 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1410 if (r < 0)
1411 return r;
1412
1413 r = seccomp_rule_add_exact(
add00535
LP
1414 seccomp,
1415 SCMP_ACT_ERRNO(EPERM),
469830d1 1416 SCMP_SYS(_sysctl),
add00535 1417 0);
469830d1 1418 if (r < 0) {
3c098014
ZJS
1419 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m",
1420 seccomp_arch_to_string(arch));
469830d1
LP
1421 continue;
1422 }
1423
1424 r = seccomp_load(seccomp);
3c098014
ZJS
1425 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1426 return r;
1427 if (r < 0)
1428 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m",
1429 seccomp_arch_to_string(arch));
469830d1
LP
1430 }
1431
1432 return 0;
1433}
1434
620dbdd2
KK
1435int seccomp_protect_syslog(void) {
1436 uint32_t arch;
1437 int r;
1438
1439 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1440 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1441
1442 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1443 if (r < 0)
1444 return r;
1445
1446 r = seccomp_rule_add_exact(
1447 seccomp,
1448 SCMP_ACT_ERRNO(EPERM),
1449 SCMP_SYS(syslog),
1450 0);
1451
1452 if (r < 0) {
1453 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1454 continue;
1455 }
1456
1457 r = seccomp_load(seccomp);
3c098014
ZJS
1458 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1459 return r;
1460 if (r < 0)
1461 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m",
1462 seccomp_arch_to_string(arch));
620dbdd2
KK
1463 }
1464
1465 return 0;
1466}
1467
6b000af4 1468int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1469 uint32_t arch;
1470 int r;
1471
1472 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1473 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1474 bool supported;
469830d1 1475
30868c1c 1476 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1477
9606bc4b
LP
1478 switch (arch) {
1479
1480 case SCMP_ARCH_X86_64:
1481 case SCMP_ARCH_X32:
1482 case SCMP_ARCH_ARM:
1483 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1484#ifdef SCMP_ARCH_LOONGARCH64
1485 case SCMP_ARCH_LOONGARCH64:
1486#endif
f5aeac14
JC
1487 case SCMP_ARCH_MIPSEL64N32:
1488 case SCMP_ARCH_MIPS64N32:
1489 case SCMP_ARCH_MIPSEL64:
1490 case SCMP_ARCH_MIPS64:
f9252236
AJ
1491#ifdef SCMP_ARCH_RISCV64
1492 case SCMP_ARCH_RISCV64:
1493#endif
9606bc4b
LP
1494 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1495 supported = true;
1496 break;
1497
9606bc4b
LP
1498 case SCMP_ARCH_S390:
1499 case SCMP_ARCH_S390X:
da1921a5 1500 case SCMP_ARCH_X86:
f5aeac14
JC
1501 case SCMP_ARCH_MIPSEL:
1502 case SCMP_ARCH_MIPS:
344e6b62
SJ
1503#ifdef SCMP_ARCH_PARISC
1504 case SCMP_ARCH_PARISC:
1505#endif
1506#ifdef SCMP_ARCH_PARISC64
1507 case SCMP_ARCH_PARISC64:
1508#endif
d5923e38
ZJS
1509 case SCMP_ARCH_PPC:
1510 case SCMP_ARCH_PPC64:
1511 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1512 default:
1513 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1514 * don't know */
1515 supported = false;
1516 break;
1517 }
1518
1519 if (!supported)
1520 continue;
1521
469830d1
LP
1522 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1523 if (r < 0)
1524 return r;
1525
6b000af4 1526 if (allow_list) {
077e8fc0 1527 int first = 0, last = 0;
469830d1
LP
1528 void *afp;
1529
6b000af4
LP
1530 /* If this is an allow list, we first block the address families that are out of
1531 * range and then everything that is not in the set. First, we find the lowest and
1532 * highest address family in the set. */
469830d1 1533
90e74a66 1534 SET_FOREACH(afp, address_families) {
077e8fc0 1535 int af = PTR_TO_INT(afp);
469830d1
LP
1536
1537 if (af <= 0 || af >= af_max())
1538 continue;
1539
1540 if (first == 0 || af < first)
1541 first = af;
1542
1543 if (last == 0 || af > last)
1544 last = af;
1545 }
1546
1547 assert((first == 0) == (last == 0));
1548
1549 if (first == 0) {
1550
1551 /* No entries in the valid range, block everything */
1552 r = seccomp_rule_add_exact(
1553 seccomp,
1554 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1555 SCMP_SYS(socket),
1556 0);
1557 if (r < 0) {
3c098014
ZJS
1558 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1559 seccomp_arch_to_string(arch));
469830d1
LP
1560 continue;
1561 }
1562
1563 } else {
1564
1565 /* Block everything below the first entry */
1566 r = seccomp_rule_add_exact(
1567 seccomp,
1568 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1569 SCMP_SYS(socket),
1570 1,
1571 SCMP_A0(SCMP_CMP_LT, first));
1572 if (r < 0) {
3c098014
ZJS
1573 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1574 seccomp_arch_to_string(arch));
469830d1
LP
1575 continue;
1576 }
1577
1578 /* Block everything above the last entry */
1579 r = seccomp_rule_add_exact(
1580 seccomp,
1581 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1582 SCMP_SYS(socket),
1583 1,
1584 SCMP_A0(SCMP_CMP_GT, last));
1585 if (r < 0) {
3c098014
ZJS
1586 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1587 seccomp_arch_to_string(arch));
469830d1
LP
1588 continue;
1589 }
1590
1591 /* Block everything between the first and last entry */
077e8fc0 1592 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1593
1594 if (set_contains(address_families, INT_TO_PTR(af)))
1595 continue;
1596
1597 r = seccomp_rule_add_exact(
1598 seccomp,
1599 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1600 SCMP_SYS(socket),
1601 1,
1602 SCMP_A0(SCMP_CMP_EQ, af));
1603 if (r < 0)
1604 break;
1605 }
469830d1 1606 if (r < 0) {
3c098014
ZJS
1607 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1608 seccomp_arch_to_string(arch));
469830d1
LP
1609 continue;
1610 }
1611 }
1612
1613 } else {
1614 void *af;
1615
6b000af4
LP
1616 /* If this is a deny list, then generate one rule for each address family that are
1617 * then combined in OR checks. */
469830d1 1618
90e74a66 1619 SET_FOREACH(af, address_families) {
469830d1
LP
1620 r = seccomp_rule_add_exact(
1621 seccomp,
1622 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1623 SCMP_SYS(socket),
1624 1,
1625 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1626 if (r < 0)
1627 break;
1628 }
469830d1 1629 if (r < 0) {
3c098014
ZJS
1630 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1631 seccomp_arch_to_string(arch));
469830d1
LP
1632 continue;
1633 }
1634 }
1635
1636 r = seccomp_load(seccomp);
3c098014
ZJS
1637 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1638 return r;
1639 if (r < 0)
1640 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m",
1641 seccomp_arch_to_string(arch));
469830d1
LP
1642 }
1643
1644 return 0;
1645}
1646
a9002749 1647int seccomp_restrict_realtime_full(int error_code) {
469830d1
LP
1648 static const int permitted_policies[] = {
1649 SCHED_OTHER,
1650 SCHED_BATCH,
1651 SCHED_IDLE,
1652 };
1653
1654 int r, max_policy = 0;
1655 uint32_t arch;
1656 unsigned i;
1657
a9002749
YW
1658 assert(error_code > 0);
1659
469830d1
LP
1660 /* Determine the highest policy constant we want to allow */
1661 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1662 if (permitted_policies[i] > max_policy)
1663 max_policy = permitted_policies[i];
1664
1665 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1666 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1667 int p;
1668
30868c1c 1669 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1670
1671 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1672 if (r < 0)
1673 return r;
1674
1675 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1676 * allow list. */
469830d1
LP
1677 for (p = 0; p < max_policy; p++) {
1678 bool good = false;
1679
6b000af4 1680 /* Check if this is in the allow list. */
469830d1
LP
1681 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1682 if (permitted_policies[i] == p) {
1683 good = true;
1684 break;
1685 }
1686
1687 if (good)
1688 continue;
1689
1690 /* Deny this policy */
1691 r = seccomp_rule_add_exact(
1692 seccomp,
a9002749 1693 SCMP_ACT_ERRNO(error_code),
469830d1
LP
1694 SCMP_SYS(sched_setscheduler),
1695 1,
1696 SCMP_A1(SCMP_CMP_EQ, p));
1697 if (r < 0) {
3c098014
ZJS
1698 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1699 seccomp_arch_to_string(arch));
469830d1
LP
1700 continue;
1701 }
1702 }
1703
6b000af4
LP
1704 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1705 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1706 r = seccomp_rule_add_exact(
add00535 1707 seccomp,
a9002749 1708 SCMP_ACT_ERRNO(error_code),
469830d1 1709 SCMP_SYS(sched_setscheduler),
add00535 1710 1,
469830d1
LP
1711 SCMP_A1(SCMP_CMP_GT, max_policy));
1712 if (r < 0) {
3c098014
ZJS
1713 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1714 seccomp_arch_to_string(arch));
469830d1
LP
1715 continue;
1716 }
add00535 1717
469830d1 1718 r = seccomp_load(seccomp);
3c098014
ZJS
1719 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1720 return r;
1721 if (r < 0)
1722 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m",
1723 seccomp_arch_to_string(arch));
469830d1
LP
1724 }
1725
1726 return 0;
1727}
1728
6dc66688
ZJS
1729static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1730 uint32_t arch,
1731 int nr,
14cb109d 1732 unsigned arg_cnt,
6dc66688
ZJS
1733 const struct scmp_arg_cmp arg) {
1734 int r;
1735
1736 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1737 if (r < 0) {
1738 _cleanup_free_ char *n = NULL;
1739
1740 n = seccomp_syscall_resolve_num_arch(arch, nr);
1741 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1742 strna(n),
1743 seccomp_arch_to_string(arch));
1744 }
1745
1746 return r;
1747}
1748
2a8d6e63 1749/* For known architectures, check that syscalls are indeed defined or not. */
f9d3fb6b 1750#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
2a8d6e63
ZJS
1751assert_cc(SCMP_SYS(shmget) > 0);
1752assert_cc(SCMP_SYS(shmat) > 0);
1753assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1754#endif
6dc66688 1755
469830d1
LP
1756int seccomp_memory_deny_write_execute(void) {
1757 uint32_t arch;
b069c2a3 1758 unsigned loaded = 0;
469830d1
LP
1759
1760 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1761 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1762 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1763
30868c1c 1764 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1765
8a50cf69
LP
1766 switch (arch) {
1767
bed4668d
CE
1768 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1769 * We ignore that here, which means there's still a way to get writable/executable
344e6b62
SJ
1770 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1771 *
1772 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1773 * on that front (kernel work done in 5.18).
1774 */
bed4668d 1775
8a50cf69 1776 case SCMP_ARCH_X86:
57311925 1777 case SCMP_ARCH_S390:
8a50cf69
LP
1778 filter_syscall = SCMP_SYS(mmap2);
1779 block_syscall = SCMP_SYS(mmap);
bed4668d 1780 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1781 break;
1782
63d00dfb 1783 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1784 case SCMP_ARCH_PPC64:
1785 case SCMP_ARCH_PPC64LE:
bed4668d 1786 case SCMP_ARCH_S390X:
2a8d6e63 1787 filter_syscall = SCMP_SYS(mmap);
bed4668d 1788 /* shmat multiplexed, see above */
8a50cf69
LP
1789 break;
1790
4278d1f5
ZJS
1791 case SCMP_ARCH_ARM:
1792 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1793 shmat_syscall = SCMP_SYS(shmat);
1794 break;
1795
8a50cf69
LP
1796 case SCMP_ARCH_X86_64:
1797 case SCMP_ARCH_X32:
79873bc8 1798 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1799#ifdef SCMP_ARCH_LOONGARCH64
1800 case SCMP_ARCH_LOONGARCH64:
1801#endif
f9252236
AJ
1802#ifdef SCMP_ARCH_RISCV64
1803 case SCMP_ARCH_RISCV64:
1804#endif
f9d3fb6b 1805 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
8a50cf69
LP
1806 shmat_syscall = SCMP_SYS(shmat);
1807 break;
1808
1809 /* Please add more definitions here, if you port systemd to other architectures! */
1810
f9d3fb6b 1811#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
8a50cf69
LP
1812#warning "Consider adding the right mmap() syscall definitions here!"
1813#endif
1814 }
1815
1816 /* Can't filter mmap() on this arch, then skip it */
1817 if (filter_syscall == 0)
1818 continue;
1819
469830d1
LP
1820 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1821 if (r < 0)
1822 return r;
1823
6dc66688
ZJS
1824 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1825 1,
1826 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1827 if (r < 0)
1828 continue;
8a50cf69
LP
1829
1830 if (block_syscall != 0) {
6dc66688
ZJS
1831 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1832 if (r < 0)
8a50cf69 1833 continue;
add00535 1834 }
a3be2849 1835
6dc66688
ZJS
1836 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1837 1,
b835eeb4
ZJS
1838 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1839 if (r < 0)
1840 continue;
1841
1842 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1843 1,
6dc66688
ZJS
1844 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1845 if (r < 0)
469830d1 1846 continue;
add00535 1847
67fb5f33 1848 if (shmat_syscall > 0) {
5ef3ed97 1849 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1850 1,
1851 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1852 if (r < 0)
8a50cf69 1853 continue;
469830d1
LP
1854 }
1855
1856 r = seccomp_load(seccomp);
3c098014
ZJS
1857 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1858 return r;
1859 if (r < 0)
b069c2a3
ZJS
1860 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1861 seccomp_arch_to_string(arch));
903659e7 1862 loaded++;
469830d1 1863 }
add00535 1864
903659e7 1865 if (loaded == 0)
b069c2a3 1866 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1867
1868 return loaded;
469830d1
LP
1869}
1870
1871int seccomp_restrict_archs(Set *archs) {
1872 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1 1873 int r;
65976868 1874 bool blocked_new = false;
469830d1
LP
1875
1876 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1877 * list.
1878 *
1879 * There are some qualifications. However the most important use is to stop processes from bypassing
1880 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1881 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1882
2428aaf8
AJ
1883 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1884 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1885 * to run a program with the restrictions applied. */
469830d1
LP
1886 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1887 if (!seccomp)
1888 return -ENOMEM;
1889
65976868
GDF
1890 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1891 uint32_t arch = seccomp_local_archs[i];
2428aaf8 1892
f833df38
BB
1893 /* See above comment, our "native" architecture is never blocked. */
1894 if (arch == seccomp_arch_native())
1895 continue;
1896
65976868
GDF
1897 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1898 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1899 continue;
2428aaf8 1900
65976868 1901 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
2428aaf8 1902
65976868
GDF
1903 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1904 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1905 * The important thing is that you can block the old 32-bit x86 syscalls.
1906 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1907 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1908 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1909
1910 if (block) {
1911 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1912 blocked_new = true;
1913 } else {
1914 r = seccomp_arch_add(seccomp, arch);
1915 if (r < 0 && r != -EEXIST)
1916 return r;
1917 }
add00535
LP
1918 }
1919
65976868
GDF
1920 /* All architectures that will be blocked by the seccomp program were
1921 * already blocked. */
1922 if (!blocked_new)
1923 return 0;
1924
469830d1
LP
1925 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1926 if (r < 0)
1927 return r;
add00535 1928
1c6af69b 1929 r = seccomp_load(seccomp);
3c098014
ZJS
1930 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1931 return r;
1932 if (r < 0)
1c6af69b
LP
1933 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1934
1935 return 0;
a3be2849 1936}
b16bd535 1937
de7fef4b
ZJS
1938int parse_syscall_archs(char **l, Set **ret_archs) {
1939 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1940 int r;
1941
1942 assert(l);
de7fef4b 1943 assert(ret_archs);
b16bd535
YW
1944
1945 STRV_FOREACH(s, l) {
1946 uint32_t a;
1947
1948 r = seccomp_arch_from_string(*s, &a);
1949 if (r < 0)
1950 return -EINVAL;
1951
de7fef4b 1952 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1953 if (r < 0)
1954 return -ENOMEM;
1955 }
1956
de7fef4b 1957 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1958 return 0;
1959}
165a31c0 1960
8cfa775f 1961int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1962 int r;
1963
1964 assert(set);
1965
1966 NULSTR_FOREACH(i, set->value) {
1967
1968 if (i[0] == '@') {
1969 const SyscallFilterSet *more;
1970
1971 more = syscall_filter_set_find(i);
1972 if (!more)
1973 return -ENXIO;
1974
165a31c0
LP
1975 r = seccomp_filter_set_add(filter, add, more);
1976 if (r < 0)
1977 return r;
1978 } else {
1979 int id;
1980
1981 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1982 if (id == __NR_SCMP_ERROR) {
1983 log_debug("Couldn't resolve system call, ignoring: %s", i);
1984 continue;
1985 }
165a31c0
LP
1986
1987 if (add) {
8cfa775f 1988 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1989 if (r < 0)
1990 return r;
1991 } else
8cfa775f 1992 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1993 }
1994 }
1995
1996 return 0;
1997}
78e864e5
TM
1998
1999int seccomp_lock_personality(unsigned long personality) {
72eafe71 2000 uint32_t arch;
78e864e5
TM
2001 int r;
2002
72eafe71
LP
2003 if (personality >= PERSONALITY_INVALID)
2004 return -EINVAL;
78e864e5 2005
72eafe71
LP
2006 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2007 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 2008
72eafe71
LP
2009 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2010 if (r < 0)
2011 return r;
2012
2013 r = seccomp_rule_add_exact(
2014 seccomp,
2015 SCMP_ACT_ERRNO(EPERM),
2016 SCMP_SYS(personality),
2017 1,
2018 SCMP_A0(SCMP_CMP_NE, personality));
448ac526 2019 if (r < 0) {
3c098014
ZJS
2020 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
2021 seccomp_arch_to_string(arch));
448ac526
LP
2022 continue;
2023 }
72eafe71
LP
2024
2025 r = seccomp_load(seccomp);
3c098014
ZJS
2026 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2027 return r;
2028 if (r < 0)
2029 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m",
2030 seccomp_arch_to_string(arch));
72eafe71
LP
2031 }
2032
2033 return 0;
78e864e5 2034}
aecd5ac6
TM
2035
2036int seccomp_protect_hostname(void) {
2037 uint32_t arch;
2038 int r;
2039
2040 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2041 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2042
2043 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2044 if (r < 0)
2045 return r;
2046
2047 r = seccomp_rule_add_exact(
2048 seccomp,
2049 SCMP_ACT_ERRNO(EPERM),
2050 SCMP_SYS(sethostname),
2051 0);
9e6e543c 2052 if (r < 0) {
3c098014
ZJS
2053 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m",
2054 seccomp_arch_to_string(arch));
aecd5ac6 2055 continue;
9e6e543c 2056 }
aecd5ac6
TM
2057
2058 r = seccomp_rule_add_exact(
2059 seccomp,
2060 SCMP_ACT_ERRNO(EPERM),
2061 SCMP_SYS(setdomainname),
2062 0);
9e6e543c 2063 if (r < 0) {
3c098014
ZJS
2064 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m",
2065 seccomp_arch_to_string(arch));
aecd5ac6 2066 continue;
9e6e543c 2067 }
aecd5ac6
TM
2068
2069 r = seccomp_load(seccomp);
3c098014
ZJS
2070 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2071 return r;
2072 if (r < 0)
2073 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m",
2074 seccomp_arch_to_string(arch));
aecd5ac6
TM
2075 }
2076
2077 return 0;
2078}
3c27973b 2079
da4dc9a6
ZJS
2080static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2081 /* Checks the mode_t parameter of the following system calls:
2082 *
2083 * → chmod() + fchmod() + fchmodat()
2084 * → open() + creat() + openat()
2085 * → mkdir() + mkdirat()
2086 * → mknod() + mknodat()
2087 *
2088 * Returns error if *everything* failed, and 0 otherwise.
2089 */
6d95e7d9 2090 int r;
da4dc9a6
ZJS
2091 bool any = false;
2092
2093 r = seccomp_rule_add_exact(
2094 seccomp,
2095 SCMP_ACT_ERRNO(EPERM),
2096 SCMP_SYS(chmod),
2097 1,
2098 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2099 if (r < 0)
2100 log_debug_errno(r, "Failed to add filter for chmod: %m");
2101 else
2102 any = true;
2103
2104 r = seccomp_rule_add_exact(
2105 seccomp,
2106 SCMP_ACT_ERRNO(EPERM),
2107 SCMP_SYS(fchmod),
2108 1,
2109 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2110 if (r < 0)
2111 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2112 else
2113 any = true;
2114
2115 r = seccomp_rule_add_exact(
2116 seccomp,
2117 SCMP_ACT_ERRNO(EPERM),
2118 SCMP_SYS(fchmodat),
2119 1,
2120 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2121 if (r < 0)
2122 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2123 else
2124 any = true;
2125
2126 r = seccomp_rule_add_exact(
2127 seccomp,
2128 SCMP_ACT_ERRNO(EPERM),
2129 SCMP_SYS(mkdir),
2130 1,
2131 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2132 if (r < 0)
2133 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2134 else
2135 any = true;
2136
2137 r = seccomp_rule_add_exact(
2138 seccomp,
2139 SCMP_ACT_ERRNO(EPERM),
2140 SCMP_SYS(mkdirat),
2141 1,
2142 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2143 if (r < 0)
2144 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2145 else
2146 any = true;
2147
2148 r = seccomp_rule_add_exact(
2149 seccomp,
2150 SCMP_ACT_ERRNO(EPERM),
2151 SCMP_SYS(mknod),
2152 1,
2153 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2154 if (r < 0)
2155 log_debug_errno(r, "Failed to add filter for mknod: %m");
2156 else
2157 any = true;
2158
2159 r = seccomp_rule_add_exact(
2160 seccomp,
2161 SCMP_ACT_ERRNO(EPERM),
2162 SCMP_SYS(mknodat),
2163 1,
2164 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2165 if (r < 0)
2166 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2167 else
2168 any = true;
2169
da4dc9a6
ZJS
2170 r = seccomp_rule_add_exact(
2171 seccomp,
2172 SCMP_ACT_ERRNO(EPERM),
2173 SCMP_SYS(open),
2174 2,
2175 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2176 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2177 if (r < 0)
2178 log_debug_errno(r, "Failed to add filter for open: %m");
2179 else
2180 any = true;
da4dc9a6
ZJS
2181
2182 r = seccomp_rule_add_exact(
2183 seccomp,
2184 SCMP_ACT_ERRNO(EPERM),
2185 SCMP_SYS(openat),
2186 2,
2187 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2188 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2189 if (r < 0)
2190 log_debug_errno(r, "Failed to add filter for openat: %m");
2191 else
2192 any = true;
2193
ecc04067
LP
2194#if defined(__SNR_openat2)
2195 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2196 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2197 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
57353d29
MG
2198 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2199 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2200 * to call open() or openat() instead. We can properly enforce policy for those functions. */
ecc04067
LP
2201 r = seccomp_rule_add_exact(
2202 seccomp,
57353d29 2203 SCMP_ACT_ERRNO(ENOSYS),
ecc04067
LP
2204 SCMP_SYS(openat2),
2205 0);
2206 if (r < 0)
2207 log_debug_errno(r, "Failed to add filter for openat2: %m");
2208 else
2209 any = true;
2210#endif
2211
da4dc9a6
ZJS
2212 r = seccomp_rule_add_exact(
2213 seccomp,
2214 SCMP_ACT_ERRNO(EPERM),
2215 SCMP_SYS(creat),
2216 1,
2217 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2218 if (r < 0)
2219 log_debug_errno(r, "Failed to add filter for creat: %m");
2220 else
2221 any = true;
2222
2223 return any ? 0 : r;
2224}
2225
3c27973b
LP
2226int seccomp_restrict_suid_sgid(void) {
2227 uint32_t arch;
da4dc9a6 2228 int r, k;
3c27973b
LP
2229
2230 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2231 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2232
2233 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2234 if (r < 0)
2235 return r;
2236
da4dc9a6
ZJS
2237 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2238 if (r < 0)
3c098014
ZJS
2239 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m",
2240 seccomp_arch_to_string(arch));
3c27973b 2241
da4dc9a6
ZJS
2242 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2243 if (k < 0)
3c098014
ZJS
2244 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m",
2245 seccomp_arch_to_string(arch));
3c27973b 2246
da4dc9a6 2247 if (r < 0 && k < 0)
3c27973b 2248 continue;
3c27973b
LP
2249
2250 r = seccomp_load(seccomp);
3c098014
ZJS
2251 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2252 return r;
2253 if (r < 0)
2254 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m",
2255 seccomp_arch_to_string(arch));
3c27973b
LP
2256 }
2257
2258 return 0;
2259}
915fb324
LP
2260
2261uint32_t scmp_act_kill_process(void) {
2262
2263 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2264 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2265 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2266 * for single-threaded apps does the right thing. */
2267
2268#ifdef SCMP_ACT_KILL_PROCESS
2269 if (seccomp_api_get() >= 3)
2270 return SCMP_ACT_KILL_PROCESS;
2271#endif
2272
2273 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2274}
22eadc28
YW
2275
2276int parse_syscall_and_errno(const char *in, char **name, int *error) {
2277 _cleanup_free_ char *n = NULL;
2278 char *p;
2279 int e = -1;
2280
2281 assert(in);
2282 assert(name);
2283 assert(error);
2284
2285 /*
2286 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2287 * If errno is omitted, then error is set to -1.
2288 * Empty syscall name is not allowed.
2289 * Here, we do not check that the syscall name is valid or not.
2290 */
2291
2292 p = strchr(in, ':');
2293 if (p) {
2294 e = seccomp_parse_errno_or_action(p + 1);
2295 if (e < 0)
2296 return e;
2297
2298 n = strndup(in, p - in);
2299 } else
2300 n = strdup(in);
2301
2302 if (!n)
2303 return -ENOMEM;
2304
2305 if (isempty(n))
2306 return -EINVAL;
2307
2308 *error = e;
2309 *name = TAKE_PTR(n);
2310
2311 return 0;
2312}
4a4654e0
LP
2313
2314static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2315 bool any = false;
2316 int r;
2317
2318 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2319 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2320
4a4654e0
LP
2321 r = seccomp_rule_add_exact(
2322 seccomp,
2323 SCMP_ACT_ERRNO(EINVAL),
2324 SCMP_SYS(open),
2325 1,
2326 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2327 if (r < 0)
2328 log_debug_errno(r, "Failed to add filter for open: %m");
2329 else
2330 any = true;
4a4654e0
LP
2331
2332 r = seccomp_rule_add_exact(
2333 seccomp,
2334 SCMP_ACT_ERRNO(EINVAL),
2335 SCMP_SYS(openat),
2336 1,
2337 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2338 if (r < 0)
2339 log_debug_errno(r, "Failed to add filter for openat: %m");
2340 else
2341 any = true;
2342
2343#if defined(__SNR_openat2)
2344 /* The new openat2() system call can't be filtered sensibly, see above. */
2345 r = seccomp_rule_add_exact(
2346 seccomp,
2347 SCMP_ACT_ERRNO(ENOSYS),
2348 SCMP_SYS(openat2),
2349 0);
2350 if (r < 0)
2351 log_debug_errno(r, "Failed to add filter for openat2: %m");
2352 else
2353 any = true;
2354#endif
2355
2356 return any ? 0 : r;
2357}
2358
2359int seccomp_suppress_sync(void) {
2360 uint32_t arch;
2361 int r;
2362
2363 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2364 * manageable, and also masks O_SYNC/O_DSYNC */
2365
2366 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2367 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
4a4654e0
LP
2368
2369 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2370 if (r < 0)
2371 return r;
2372
2373 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2374 int id;
2375
2376 id = seccomp_syscall_resolve_name(c);
2377 if (id == __NR_SCMP_ERROR) {
2378 log_debug("System call %s is not known, ignoring.", c);
2379 continue;
2380 }
2381
2382 r = seccomp_rule_add_exact(
2383 seccomp,
2384 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2385 id,
2386 0);
2387 if (r < 0)
2388 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2389 }
2390
2391 (void) block_open_flag(seccomp, O_SYNC);
2392#if O_DSYNC != O_SYNC
2393 (void) block_open_flag(seccomp, O_DSYNC);
2394#endif
2395
2396 r = seccomp_load(seccomp);
3c098014
ZJS
2397 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2398 return r;
2399 if (r < 0)
2400 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m",
2401 seccomp_arch_to_string(arch));
4a4654e0
LP
2402 }
2403
2404 return 0;
2405}