]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
core: when applying syscall filters, use ENOSYS for unknown calls
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
a8fbdf54 6#include <stddef.h>
469830d1 7#include <sys/mman.h>
d347d902 8#include <sys/prctl.h>
469830d1 9#include <sys/shm.h>
3c27973b 10#include <sys/stat.h>
57183d11 11
e83156c2
YW
12/* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13#include "missing_syscall_def.h"
14#include <seccomp.h>
15
469830d1 16#include "af-list.h"
add00535 17#include "alloc-util.h"
44aaddad 18#include "env-util.h"
d8b4d14d 19#include "errno-list.h"
a8fbdf54 20#include "macro.h"
241b1577 21#include "namespace-util.h"
add00535 22#include "nsflags.h"
d8b4d14d 23#include "nulstr-util.h"
78e864e5 24#include "process-util.h"
cf0fbc49 25#include "seccomp-util.h"
b16bd535 26#include "set.h"
07630cea 27#include "string-util.h"
b16bd535 28#include "strv.h"
469830d1 29
65976868
GDF
30/* This array will be modified at runtime as seccomp_restrict_archs is called. */
31uint32_t seccomp_local_archs[] = {
469830d1 32
6b000af4 33 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
34
35#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
36 SCMP_ARCH_X86,
37 SCMP_ARCH_X86_64,
f2d9751c
LP
38 SCMP_ARCH_X32, /* native */
39#elif defined(__x86_64__) && !defined(__ILP32__)
40 SCMP_ARCH_X86,
469830d1 41 SCMP_ARCH_X32,
f2d9751c
LP
42 SCMP_ARCH_X86_64, /* native */
43#elif defined(__i386__)
44 SCMP_ARCH_X86,
45#elif defined(__aarch64__)
469830d1 46 SCMP_ARCH_ARM,
f2d9751c
LP
47 SCMP_ARCH_AARCH64, /* native */
48#elif defined(__arm__)
49 SCMP_ARCH_ARM,
f9d3fb6b
XW
50#elif defined(__loongarch_lp64)
51 SCMP_ARCH_LOONGARCH64,
f2d9751c
LP
52#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
53 SCMP_ARCH_MIPSEL,
54 SCMP_ARCH_MIPS, /* native */
55#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 56 SCMP_ARCH_MIPS,
f2d9751c
LP
57 SCMP_ARCH_MIPSEL, /* native */
58#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS,
61 SCMP_ARCH_MIPSEL64N32,
469830d1 62 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
63 SCMP_ARCH_MIPSEL64,
64 SCMP_ARCH_MIPS64, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL,
f2d9751c
LP
68 SCMP_ARCH_MIPS64N32,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64,
71 SCMP_ARCH_MIPSEL64, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS,
469830d1 75 SCMP_ARCH_MIPSEL64,
f2d9751c 76 SCMP_ARCH_MIPS64,
469830d1 77 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
78 SCMP_ARCH_MIPS64N32, /* native */
79#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPS,
81 SCMP_ARCH_MIPSEL,
82 SCMP_ARCH_MIPS64,
83 SCMP_ARCH_MIPSEL64,
84 SCMP_ARCH_MIPS64N32,
85 SCMP_ARCH_MIPSEL64N32, /* native */
344e6b62
SJ
86#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
87 SCMP_ARCH_PARISC,
88 SCMP_ARCH_PARISC64, /* native */
89#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
90 SCMP_ARCH_PARISC,
f2d9751c 91#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 92 SCMP_ARCH_PPC,
469830d1 93 SCMP_ARCH_PPC64LE,
f2d9751c
LP
94 SCMP_ARCH_PPC64, /* native */
95#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
96 SCMP_ARCH_PPC,
97 SCMP_ARCH_PPC64,
98 SCMP_ARCH_PPC64LE, /* native */
99#elif defined(__powerpc__)
100 SCMP_ARCH_PPC,
f9252236
AJ
101#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
102 SCMP_ARCH_RISCV64,
f2d9751c
LP
103#elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106#elif defined(__s390__)
469830d1 107 SCMP_ARCH_S390,
469830d1 108#endif
65976868 109 SECCOMP_LOCAL_ARCH_END
469830d1 110 };
57183d11
LP
111
112const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
57183d11 117
79893116 118 switch (c) {
aa34055f 119 case SCMP_ARCH_NATIVE:
57183d11 120 return "native";
aa34055f 121 case SCMP_ARCH_X86:
57183d11 122 return "x86";
aa34055f 123 case SCMP_ARCH_X86_64:
57183d11 124 return "x86-64";
aa34055f 125 case SCMP_ARCH_X32:
57183d11 126 return "x32";
aa34055f 127 case SCMP_ARCH_ARM:
57183d11 128 return "arm";
aa34055f
ZJS
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
f9d3fb6b
XW
131#ifdef SCMP_ARCH_LOONGARCH64
132 case SCMP_ARCH_LOONGARCH64:
133 return "loongarch64";
134#endif
aa34055f
ZJS
135 case SCMP_ARCH_MIPS:
136 return "mips";
137 case SCMP_ARCH_MIPS64:
138 return "mips64";
139 case SCMP_ARCH_MIPS64N32:
140 return "mips64-n32";
141 case SCMP_ARCH_MIPSEL:
142 return "mips-le";
143 case SCMP_ARCH_MIPSEL64:
144 return "mips64-le";
145 case SCMP_ARCH_MIPSEL64N32:
146 return "mips64-le-n32";
344e6b62
SJ
147#ifdef SCMP_ARCH_PARISC
148 case SCMP_ARCH_PARISC:
149 return "parisc";
150#endif
151#ifdef SCMP_ARCH_PARISC64
152 case SCMP_ARCH_PARISC64:
153 return "parisc64";
154#endif
aa34055f
ZJS
155 case SCMP_ARCH_PPC:
156 return "ppc";
157 case SCMP_ARCH_PPC64:
158 return "ppc64";
159 case SCMP_ARCH_PPC64LE:
160 return "ppc64-le";
f9252236
AJ
161#ifdef SCMP_ARCH_RISCV64
162 case SCMP_ARCH_RISCV64:
163 return "riscv64";
164#endif
aa34055f 165 case SCMP_ARCH_S390:
6abfd303 166 return "s390";
aa34055f 167 case SCMP_ARCH_S390X:
6abfd303 168 return "s390x";
aa34055f
ZJS
169 default:
170 return NULL;
171 }
57183d11
LP
172}
173
174int seccomp_arch_from_string(const char *n, uint32_t *ret) {
175 if (!n)
176 return -EINVAL;
177
178 assert(ret);
179
180 if (streq(n, "native"))
181 *ret = SCMP_ARCH_NATIVE;
182 else if (streq(n, "x86"))
183 *ret = SCMP_ARCH_X86;
184 else if (streq(n, "x86-64"))
185 *ret = SCMP_ARCH_X86_64;
186 else if (streq(n, "x32"))
187 *ret = SCMP_ARCH_X32;
188 else if (streq(n, "arm"))
189 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
190 else if (streq(n, "arm64"))
191 *ret = SCMP_ARCH_AARCH64;
f9d3fb6b
XW
192#ifdef SCMP_ARCH_LOONGARCH64
193 else if (streq(n, "loongarch64"))
194 *ret = SCMP_ARCH_LOONGARCH64;
195#endif
aa34055f
ZJS
196 else if (streq(n, "mips"))
197 *ret = SCMP_ARCH_MIPS;
198 else if (streq(n, "mips64"))
199 *ret = SCMP_ARCH_MIPS64;
200 else if (streq(n, "mips64-n32"))
201 *ret = SCMP_ARCH_MIPS64N32;
202 else if (streq(n, "mips-le"))
203 *ret = SCMP_ARCH_MIPSEL;
204 else if (streq(n, "mips64-le"))
205 *ret = SCMP_ARCH_MIPSEL64;
206 else if (streq(n, "mips64-le-n32"))
207 *ret = SCMP_ARCH_MIPSEL64N32;
344e6b62
SJ
208#ifdef SCMP_ARCH_PARISC
209 else if (streq(n, "parisc"))
210 *ret = SCMP_ARCH_PARISC;
211#endif
212#ifdef SCMP_ARCH_PARISC64
213 else if (streq(n, "parisc64"))
214 *ret = SCMP_ARCH_PARISC64;
215#endif
aa34055f
ZJS
216 else if (streq(n, "ppc"))
217 *ret = SCMP_ARCH_PPC;
218 else if (streq(n, "ppc64"))
219 *ret = SCMP_ARCH_PPC64;
220 else if (streq(n, "ppc64-le"))
221 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
222#ifdef SCMP_ARCH_RISCV64
223 else if (streq(n, "riscv64"))
224 *ret = SCMP_ARCH_RISCV64;
225#endif
6abfd303
HB
226 else if (streq(n, "s390"))
227 *ret = SCMP_ARCH_S390;
228 else if (streq(n, "s390x"))
229 *ret = SCMP_ARCH_S390X;
57183d11
LP
230 else
231 return -EINVAL;
232
233 return 0;
234}
e9642be2 235
469830d1 236int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 237 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
238 int r;
239
469830d1
LP
240 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
241 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
242
243 seccomp = seccomp_init(default_action);
244 if (!seccomp)
245 return -ENOMEM;
246
469830d1
LP
247 if (arch != SCMP_ARCH_NATIVE &&
248 arch != seccomp_arch_native()) {
249
1b52793d 250 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 251 if (r < 0)
b4eaa6cc 252 return r;
469830d1 253
1b52793d 254 r = seccomp_arch_add(seccomp, arch);
469830d1 255 if (r < 0)
b4eaa6cc 256 return r;
469830d1
LP
257
258 assert(seccomp_arch_exist(seccomp, arch) >= 0);
259 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
260 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
261 } else {
262 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
263 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
264 }
265
266 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 267 if (r < 0)
b4eaa6cc 268 return r;
8d7b0c8f
LP
269
270 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
271 if (r < 0)
b4eaa6cc 272 return r;
8d7b0c8f 273
44aaddad
SD
274#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
275 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
276 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
277 if (r < 0)
278 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
279 }
280#endif
281
b4eaa6cc 282 *ret = TAKE_PTR(seccomp);
8d7b0c8f 283 return 0;
8d7b0c8f
LP
284}
285
static bool is_basic_seccomp_available(void) {
        /* Queries the seccomp state via prctl(PR_GET_SECCOMP). Any non-negative answer counts as
         * "available"; a negative one (i.e. the call failed) counts as "not available". */
        int mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
289
static bool is_seccomp_filter_available(void) {
        /* Probes for filter mode by asking prctl() to install a NULL filter. The request is expected
         * to fail, and filter mode is reported as available only if it fails with EFAULT specifically
         * (presumably meaning the kernel got as far as looking at the filter pointer; see prctl(2)). */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
294
bool is_seccomp_available(void) {
        /* Tells whether seccomp filtering may be used. The answer is computed on the first call and
         * then served from a static cache (-1 means "not determined yet").
         *
         * $SYSTEMD_SECCOMP set to a false value disables seccomp outright. In every other case (true,
         * unset, or unparsable) the two prctl() probes decide. */
        static int cache = -1;
        int b;

        if (cache >= 0)
                return cache;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                cache = false;
                return cache;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cache = is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cache;
}
315
8130926d 316const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 317 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 318 .name = "@default",
d5efc18b 319 .help = "System calls that are always permitted",
40eb6a80 320 .value =
5f02870a 321 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
5abede32 322 "brk\0"
8e24b1d2 323 "cacheflush\0"
40eb6a80 324 "clock_getres\0"
6ca67710 325 "clock_getres_time64\0"
40eb6a80 326 "clock_gettime\0"
6ca67710 327 "clock_gettime64\0"
40eb6a80 328 "clock_nanosleep\0"
6ca67710 329 "clock_nanosleep_time64\0"
40eb6a80
ZJS
330 "execve\0"
331 "exit\0"
332 "exit_group\0"
e41b0f42 333 "futex\0"
6ca67710 334 "futex_time64\0"
76e86b8d 335 "futex_waitv\0"
e41b0f42
LP
336 "get_robust_list\0"
337 "get_thread_area\0"
09d3020b
DH
338 "getegid\0"
339 "getegid32\0"
340 "geteuid\0"
341 "geteuid32\0"
342 "getgid\0"
343 "getgid32\0"
344 "getgroups\0"
345 "getgroups32\0"
346 "getpgid\0"
347 "getpgrp\0"
348 "getpid\0"
349 "getppid\0"
14f4b1b5 350 "getrandom\0"
09d3020b
DH
351 "getresgid\0"
352 "getresgid32\0"
353 "getresuid\0"
354 "getresuid32\0"
40eb6a80 355 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
356 "getsid\0"
357 "gettid\0"
40eb6a80 358 "gettimeofday\0"
09d3020b
DH
359 "getuid\0"
360 "getuid32\0"
e41b0f42 361 "membarrier\0"
5abede32
LP
362 "mmap\0"
363 "mmap2\0"
47286254 364 "mprotect\0"
11b9105d 365 "munmap\0"
40eb6a80
ZJS
366 "nanosleep\0"
367 "pause\0"
4c3a9176 368 "prlimit64\0"
e41b0f42 369 "restart_syscall\0"
09925036 370 "riscv_flush_icache\0"
ca15fc48 371 "riscv_hwprobe\0"
6fee3be0 372 "rseq\0"
40eb6a80 373 "rt_sigreturn\0"
7df660e4 374 "sched_getaffinity\0"
8f44de08 375 "sched_yield\0"
e41b0f42
LP
376 "set_robust_list\0"
377 "set_thread_area\0"
378 "set_tid_address\0"
ce5faeac 379 "set_tls\0"
40eb6a80
ZJS
380 "sigreturn\0"
381 "time\0"
4c3a9176 382 "ugetrlimit\0"
40eb6a80 383 },
44898c53
LP
384 [SYSCALL_FILTER_SET_AIO] = {
385 .name = "@aio",
386 .help = "Asynchronous IO",
387 .value =
388 "io_cancel\0"
389 "io_destroy\0"
390 "io_getevents\0"
a05cfe23 391 "io_pgetevents\0"
6ca67710 392 "io_pgetevents_time64\0"
44898c53
LP
393 "io_setup\0"
394 "io_submit\0"
9e486265
LP
395 "io_uring_enter\0"
396 "io_uring_register\0"
397 "io_uring_setup\0"
44898c53 398 },
133ddbbe 399 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 400 .name = "@basic-io",
d5efc18b 401 .help = "Basic IO",
133ddbbe 402 .value =
648a0ed0 403 "_llseek\0"
133ddbbe 404 "close\0"
6ea0d25c 405 "close_range\0"
648a0ed0 406 "dup\0"
133ddbbe
LP
407 "dup2\0"
408 "dup3\0"
133ddbbe
LP
409 "lseek\0"
410 "pread64\0"
411 "preadv\0"
44898c53 412 "preadv2\0"
133ddbbe
LP
413 "pwrite64\0"
414 "pwritev\0"
44898c53 415 "pwritev2\0"
133ddbbe
LP
416 "read\0"
417 "readv\0"
418 "write\0"
419 "writev\0"
420 },
44898c53
LP
421 [SYSCALL_FILTER_SET_CHOWN] = {
422 .name = "@chown",
423 .help = "Change ownership of files and directories",
424 .value =
425 "chown\0"
426 "chown32\0"
427 "fchown\0"
428 "fchown32\0"
429 "fchownat\0"
430 "lchown\0"
431 "lchown32\0"
432 },
8130926d 433 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 434 .name = "@clock",
d5efc18b 435 .help = "Change the system time",
201c1cc2
TM
436 .value =
437 "adjtimex\0"
1f9ac68b 438 "clock_adjtime\0"
6ca67710 439 "clock_adjtime64\0"
1f9ac68b 440 "clock_settime\0"
6ca67710 441 "clock_settime64\0"
201c1cc2 442 "settimeofday\0"
8130926d
LP
443 },
444 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 445 .name = "@cpu-emulation",
d5efc18b 446 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
447 .value =
448 "modify_ldt\0"
449 "subpage_prot\0"
450 "switch_endian\0"
451 "vm86\0"
452 "vm86old\0"
8130926d
LP
453 },
454 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 455 .name = "@debug",
d5efc18b 456 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
457 .value =
458 "lookup_dcookie\0"
459 "perf_event_open\0"
8270e3d8 460 "pidfd_getfd\0"
1f9ac68b
LP
461 "ptrace\0"
462 "rtas\0"
463 "s390_runtime_instr\0"
464 "sys_debug_setcontext\0"
8130926d 465 },
1a1b13c9
LP
466 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
467 .name = "@file-system",
468 .help = "File system operations",
469 .value =
470 "access\0"
471 "chdir\0"
472 "chmod\0"
473 "close\0"
474 "creat\0"
475 "faccessat\0"
bcf08acb 476 "faccessat2\0"
1a1b13c9
LP
477 "fallocate\0"
478 "fchdir\0"
479 "fchmod\0"
480 "fchmodat\0"
6e10405a 481 "fchmodat2\0"
1a1b13c9 482 "fcntl\0"
ceaa6aa7 483 "fcntl64\0"
1a1b13c9
LP
484 "fgetxattr\0"
485 "flistxattr\0"
ceaa6aa7 486 "fremovexattr\0"
1a1b13c9 487 "fsetxattr\0"
1a1b13c9 488 "fstat\0"
ceaa6aa7 489 "fstat64\0"
1a1b13c9 490 "fstatat64\0"
1a1b13c9 491 "fstatfs\0"
ceaa6aa7 492 "fstatfs64\0"
1a1b13c9 493 "ftruncate\0"
ceaa6aa7 494 "ftruncate64\0"
1a1b13c9
LP
495 "futimesat\0"
496 "getcwd\0"
1a1b13c9 497 "getdents\0"
ceaa6aa7 498 "getdents64\0"
1a1b13c9
LP
499 "getxattr\0"
500 "inotify_add_watch\0"
ceaa6aa7 501 "inotify_init\0"
1a1b13c9
LP
502 "inotify_init1\0"
503 "inotify_rm_watch\0"
504 "lgetxattr\0"
505 "link\0"
506 "linkat\0"
507 "listxattr\0"
508 "llistxattr\0"
509 "lremovexattr\0"
510 "lsetxattr\0"
1a1b13c9 511 "lstat\0"
ceaa6aa7 512 "lstat64\0"
1a1b13c9
LP
513 "mkdir\0"
514 "mkdirat\0"
515 "mknod\0"
516 "mknodat\0"
1a1b13c9 517 "newfstatat\0"
ceaa6aa7
LP
518 "oldfstat\0"
519 "oldlstat\0"
520 "oldstat\0"
1a1b13c9
LP
521 "open\0"
522 "openat\0"
8270e3d8 523 "openat2\0"
1a1b13c9
LP
524 "readlink\0"
525 "readlinkat\0"
526 "removexattr\0"
527 "rename\0"
1a1b13c9 528 "renameat\0"
ceaa6aa7 529 "renameat2\0"
1a1b13c9
LP
530 "rmdir\0"
531 "setxattr\0"
1a1b13c9 532 "stat\0"
ceaa6aa7 533 "stat64\0"
1a1b13c9 534 "statfs\0"
ceaa6aa7 535 "statfs64\0"
a4135a74 536 "statx\0"
1a1b13c9
LP
537 "symlink\0"
538 "symlinkat\0"
1a1b13c9 539 "truncate\0"
ceaa6aa7 540 "truncate64\0"
1a1b13c9
LP
541 "unlink\0"
542 "unlinkat\0"
ceaa6aa7 543 "utime\0"
1a1b13c9 544 "utimensat\0"
6ca67710 545 "utimensat_time64\0"
1a1b13c9
LP
546 "utimes\0"
547 },
8130926d 548 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 549 .name = "@io-event",
d5efc18b 550 .help = "Event loop system calls",
201c1cc2
TM
551 .value =
552 "_newselect\0"
201c1cc2 553 "epoll_create\0"
215728ff 554 "epoll_create1\0"
201c1cc2
TM
555 "epoll_ctl\0"
556 "epoll_ctl_old\0"
557 "epoll_pwait\0"
34254e59 558 "epoll_pwait2\0"
201c1cc2
TM
559 "epoll_wait\0"
560 "epoll_wait_old\0"
201c1cc2 561 "eventfd\0"
215728ff 562 "eventfd2\0"
201c1cc2
TM
563 "poll\0"
564 "ppoll\0"
6ca67710 565 "ppoll_time64\0"
201c1cc2 566 "pselect6\0"
6ca67710 567 "pselect6_time64\0"
201c1cc2 568 "select\0"
8130926d
LP
569 },
570 [SYSCALL_FILTER_SET_IPC] = {
8130926d 571 .name = "@ipc",
d5efc18b
ZJS
572 .help = "SysV IPC, POSIX Message Queues or other IPC",
573 .value =
574 "ipc\0"
cd5bfd7e 575 "memfd_create\0"
201c1cc2
TM
576 "mq_getsetattr\0"
577 "mq_notify\0"
578 "mq_open\0"
579 "mq_timedreceive\0"
6ca67710 580 "mq_timedreceive_time64\0"
201c1cc2 581 "mq_timedsend\0"
6ca67710 582 "mq_timedsend_time64\0"
201c1cc2
TM
583 "mq_unlink\0"
584 "msgctl\0"
585 "msgget\0"
586 "msgrcv\0"
587 "msgsnd\0"
cd5bfd7e 588 "pipe\0"
215728ff 589 "pipe2\0"
34254e59 590 "process_madvise\0"
201c1cc2
TM
591 "process_vm_readv\0"
592 "process_vm_writev\0"
593 "semctl\0"
594 "semget\0"
595 "semop\0"
596 "semtimedop\0"
6ca67710 597 "semtimedop_time64\0"
201c1cc2
TM
598 "shmat\0"
599 "shmctl\0"
600 "shmdt\0"
601 "shmget\0"
8130926d
LP
602 },
603 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 604 .name = "@keyring",
d5efc18b 605 .help = "Kernel keyring access",
1f9ac68b
LP
606 .value =
607 "add_key\0"
608 "keyctl\0"
609 "request_key\0"
8130926d 610 },
cd0ddf6f
LP
611 [SYSCALL_FILTER_SET_MEMLOCK] = {
612 .name = "@memlock",
613 .help = "Memory locking control",
614 .value =
615 "mlock\0"
616 "mlock2\0"
617 "mlockall\0"
618 "munlock\0"
619 "munlockall\0"
620 },
8130926d 621 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 622 .name = "@module",
d5efc18b 623 .help = "Loading and unloading of kernel modules",
201c1cc2 624 .value =
201c1cc2
TM
625 "delete_module\0"
626 "finit_module\0"
627 "init_module\0"
8130926d
LP
628 },
629 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 630 .name = "@mount",
d5efc18b 631 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
632 .value =
633 "chroot\0"
9e486265
LP
634 "fsconfig\0"
635 "fsmount\0"
636 "fsopen\0"
637 "fspick\0"
201c1cc2 638 "mount\0"
34254e59 639 "mount_setattr\0"
9e486265
LP
640 "move_mount\0"
641 "open_tree\0"
201c1cc2 642 "pivot_root\0"
201c1cc2 643 "umount\0"
215728ff 644 "umount2\0"
8130926d
LP
645 },
646 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 647 .name = "@network-io",
d5efc18b 648 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 649 .value =
201c1cc2 650 "accept\0"
215728ff 651 "accept4\0"
201c1cc2
TM
652 "bind\0"
653 "connect\0"
654 "getpeername\0"
655 "getsockname\0"
656 "getsockopt\0"
657 "listen\0"
658 "recv\0"
659 "recvfrom\0"
660 "recvmmsg\0"
6ca67710 661 "recvmmsg_time64\0"
201c1cc2
TM
662 "recvmsg\0"
663 "send\0"
664 "sendmmsg\0"
665 "sendmsg\0"
666 "sendto\0"
667 "setsockopt\0"
668 "shutdown\0"
669 "socket\0"
670 "socketcall\0"
671 "socketpair\0"
8130926d
LP
672 },
673 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 674 /* some unknown even to libseccomp */
8130926d 675 .name = "@obsolete",
d5efc18b 676 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
677 .value =
678 "_sysctl\0"
679 "afs_syscall\0"
802fa07a 680 "bdflush\0"
201c1cc2 681 "break\0"
1f9ac68b 682 "create_module\0"
201c1cc2
TM
683 "ftime\0"
684 "get_kernel_syms\0"
201c1cc2
TM
685 "getpmsg\0"
686 "gtty\0"
7e0c3b8f 687 "idle\0"
201c1cc2 688 "lock\0"
201c1cc2 689 "mpx\0"
201c1cc2
TM
690 "prof\0"
691 "profil\0"
201c1cc2
TM
692 "putpmsg\0"
693 "query_module\0"
201c1cc2
TM
694 "security\0"
695 "sgetmask\0"
696 "ssetmask\0"
ae5e9bf4 697 "stime\0"
201c1cc2 698 "stty\0"
1f9ac68b 699 "sysfs\0"
201c1cc2
TM
700 "tuxcall\0"
701 "ulimit\0"
702 "uselib\0"
1f9ac68b 703 "ustat\0"
201c1cc2 704 "vserver\0"
8130926d 705 },
9493b168
ZJS
706 [SYSCALL_FILTER_SET_PKEY] = {
707 .name = "@pkey",
708 .help = "System calls used for memory protection keys",
709 .value =
710 "pkey_alloc\0"
711 "pkey_free\0"
712 "pkey_mprotect\0"
713 },
8130926d 714 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 715 .name = "@privileged",
d5efc18b 716 .help = "All system calls which need super-user capabilities",
201c1cc2 717 .value =
44898c53 718 "@chown\0"
201c1cc2
TM
719 "@clock\0"
720 "@module\0"
721 "@raw-io\0"
af0f047b
LP
722 "@reboot\0"
723 "@swap\0"
215728ff 724 "_sysctl\0"
201c1cc2 725 "acct\0"
201c1cc2 726 "bpf\0"
1f9ac68b 727 "capset\0"
201c1cc2 728 "chroot\0"
a05cfe23 729 "fanotify_init\0"
9e486265 730 "fanotify_mark\0"
201c1cc2 731 "nfsservctl\0"
a05cfe23 732 "open_by_handle_at\0"
201c1cc2
TM
733 "pivot_root\0"
734 "quotactl\0"
76e86b8d 735 "quotactl_fd\0"
201c1cc2 736 "setdomainname\0"
201c1cc2 737 "setfsuid\0"
215728ff 738 "setfsuid32\0"
201c1cc2 739 "setgroups\0"
215728ff 740 "setgroups32\0"
201c1cc2 741 "sethostname\0"
201c1cc2 742 "setresuid\0"
215728ff 743 "setresuid32\0"
201c1cc2 744 "setreuid\0"
215728ff 745 "setreuid32\0"
e05ee49b 746 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 747 "setuid32\0"
201c1cc2 748 "vhangup\0"
8130926d
LP
749 },
750 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 751 .name = "@process",
7b121df6 752 .help = "Process control, execution, namespacing operations",
201c1cc2 753 .value =
09d3020b 754 "capget\0" /* Able to query arbitrary processes */
201c1cc2 755 "clone\0"
c5503601
ZJS
756 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
757 * implement seccomp, so we don't need to list it at all. C.f.
758 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
9e486265 759 "clone3\0"
201c1cc2
TM
760 "execveat\0"
761 "fork\0"
b887d2eb 762 "getrusage\0"
201c1cc2 763 "kill\0"
9e486265 764 "pidfd_open\0"
46fcf95d 765 "pidfd_send_signal\0"
201c1cc2 766 "prctl\0"
b887d2eb
LP
767 "rt_sigqueueinfo\0"
768 "rt_tgsigqueueinfo\0"
201c1cc2 769 "setns\0"
a9518dc3 770 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 771 "tgkill\0"
b887d2eb 772 "times\0"
201c1cc2
TM
773 "tkill\0"
774 "unshare\0"
775 "vfork\0"
b887d2eb
LP
776 "wait4\0"
777 "waitid\0"
778 "waitpid\0"
8130926d
LP
779 },
780 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 781 .name = "@raw-io",
d5efc18b 782 .help = "Raw I/O port access",
201c1cc2
TM
783 .value =
784 "ioperm\0"
785 "iopl\0"
1f9ac68b 786 "pciconfig_iobase\0"
201c1cc2
TM
787 "pciconfig_read\0"
788 "pciconfig_write\0"
789 "s390_pci_mmio_read\0"
790 "s390_pci_mmio_write\0"
8130926d 791 },
bd2ab3f4
LP
792 [SYSCALL_FILTER_SET_REBOOT] = {
793 .name = "@reboot",
794 .help = "Reboot and reboot preparation/kexec",
795 .value =
bd2ab3f4 796 "kexec_file_load\0"
e59608fa 797 "kexec_load\0"
bd2ab3f4
LP
798 "reboot\0"
799 },
133ddbbe 800 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 801 .name = "@resources",
58a8f68b 802 .help = "Alter resource settings",
133ddbbe 803 .value =
0963c053
LP
804 "ioprio_set\0"
805 "mbind\0"
806 "migrate_pages\0"
807 "move_pages\0"
808 "nice\0"
0963c053
LP
809 "sched_setaffinity\0"
810 "sched_setattr\0"
133ddbbe
LP
811 "sched_setparam\0"
812 "sched_setscheduler\0"
0963c053 813 "set_mempolicy\0"
76e86b8d 814 "set_mempolicy_home_node\0"
133ddbbe
LP
815 "setpriority\0"
816 "setrlimit\0"
133ddbbe 817 },
d12632a8
LP
818 [SYSCALL_FILTER_SET_SANDBOX] = {
819 .name = "@sandbox",
820 .help = "Sandbox functionality",
821 .value =
822 "landlock_add_rule\0"
823 "landlock_create_ruleset\0"
824 "landlock_restrict_self\0"
825 "seccomp\0"
826 },
6eaaeee9
LP
827 [SYSCALL_FILTER_SET_SETUID] = {
828 .name = "@setuid",
829 .help = "Operations for changing user/group credentials",
830 .value =
6eaaeee9 831 "setgid\0"
215728ff 832 "setgid32\0"
6eaaeee9 833 "setgroups\0"
215728ff 834 "setgroups32\0"
6eaaeee9 835 "setregid\0"
215728ff 836 "setregid32\0"
6eaaeee9 837 "setresgid\0"
215728ff 838 "setresgid32\0"
6eaaeee9 839 "setresuid\0"
215728ff 840 "setresuid32\0"
6eaaeee9 841 "setreuid\0"
215728ff 842 "setreuid32\0"
6eaaeee9 843 "setuid\0"
215728ff 844 "setuid32\0"
6eaaeee9 845 },
cd0ddf6f
LP
846 [SYSCALL_FILTER_SET_SIGNAL] = {
847 .name = "@signal",
848 .help = "Process signal handling",
849 .value =
850 "rt_sigaction\0"
851 "rt_sigpending\0"
852 "rt_sigprocmask\0"
853 "rt_sigsuspend\0"
854 "rt_sigtimedwait\0"
6ca67710 855 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
856 "sigaction\0"
857 "sigaltstack\0"
858 "signal\0"
859 "signalfd\0"
860 "signalfd4\0"
861 "sigpending\0"
862 "sigprocmask\0"
863 "sigsuspend\0"
864 },
bd2ab3f4
LP
865 [SYSCALL_FILTER_SET_SWAP] = {
866 .name = "@swap",
867 .help = "Enable/disable swap devices",
868 .value =
869 "swapoff\0"
870 "swapon\0"
871 },
44898c53
LP
872 [SYSCALL_FILTER_SET_SYNC] = {
873 .name = "@sync",
874 .help = "Synchronize files and memory to storage",
875 .value =
876 "fdatasync\0"
877 "fsync\0"
878 "msync\0"
879 "sync\0"
880 "sync_file_range\0"
a8fb09f5 881 "sync_file_range2\0"
44898c53
LP
882 "syncfs\0"
883 },
70526841
LP
884 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
885 .name = "@system-service",
886 .help = "General system service operations",
887 .value =
888 "@aio\0"
889 "@basic-io\0"
890 "@chown\0"
891 "@default\0"
892 "@file-system\0"
893 "@io-event\0"
894 "@ipc\0"
895 "@keyring\0"
896 "@memlock\0"
897 "@network-io\0"
898 "@process\0"
899 "@resources\0"
900 "@setuid\0"
901 "@signal\0"
902 "@sync\0"
903 "@timer\0"
26b682e8 904 "arm_fadvise64_64\0"
70526841
LP
905 "capget\0"
906 "capset\0"
907 "copy_file_range\0"
908 "fadvise64\0"
909 "fadvise64_64\0"
910 "flock\0"
911 "get_mempolicy\0"
912 "getcpu\0"
913 "getpriority\0"
70526841
LP
914 "ioctl\0"
915 "ioprio_get\0"
916 "kcmp\0"
917 "madvise\0"
70526841
LP
918 "mremap\0"
919 "name_to_handle_at\0"
920 "oldolduname\0"
921 "olduname\0"
922 "personality\0"
923 "readahead\0"
924 "readdir\0"
925 "remap_file_pages\0"
926 "sched_get_priority_max\0"
927 "sched_get_priority_min\0"
70526841
LP
928 "sched_getattr\0"
929 "sched_getparam\0"
930 "sched_getscheduler\0"
931 "sched_rr_get_interval\0"
6ca67710 932 "sched_rr_get_interval_time64\0"
70526841
LP
933 "sched_yield\0"
934 "sendfile\0"
935 "sendfile64\0"
936 "setfsgid\0"
937 "setfsgid32\0"
938 "setfsuid\0"
939 "setfsuid32\0"
940 "setpgid\0"
941 "setsid\0"
942 "splice\0"
943 "sysinfo\0"
944 "tee\0"
945 "umask\0"
946 "uname\0"
947 "userfaultfd\0"
948 "vmsplice\0"
949 },
cd0ddf6f
LP
950 [SYSCALL_FILTER_SET_TIMER] = {
951 .name = "@timer",
952 .help = "Schedule operations by time",
953 .value =
954 "alarm\0"
955 "getitimer\0"
956 "setitimer\0"
957 "timer_create\0"
958 "timer_delete\0"
959 "timer_getoverrun\0"
960 "timer_gettime\0"
6ca67710 961 "timer_gettime64\0"
cd0ddf6f 962 "timer_settime\0"
6ca67710 963 "timer_settime64\0"
cd0ddf6f
LP
964 "timerfd_create\0"
965 "timerfd_gettime\0"
6ca67710 966 "timerfd_gettime64\0"
cd0ddf6f 967 "timerfd_settime\0"
6ca67710 968 "timerfd_settime64\0"
cd0ddf6f
LP
969 "times\0"
970 },
95aac012
ZJS
971 [SYSCALL_FILTER_SET_KNOWN] = {
972 .name = "@known",
973 .help = "All known syscalls declared in the kernel",
974 .value =
6d6a0854 975 "@obsolete\0"
95aac012
ZJS
976#include "syscall-list.h"
977 },
201c1cc2 978};
8130926d
LP
979
980const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
981 if (isempty(name) || name[0] != '@')
982 return NULL;
983
077e8fc0 984 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
985 if (streq(syscall_filter_sets[i].name, name))
986 return syscall_filter_sets + i;
987
988 return NULL;
989}
990
000c0520
ZJS
991static int add_syscall_filter_set(
992 scmp_filter_ctx seccomp,
993 const SyscallFilterSet *set,
994 uint32_t action,
995 char **exclude,
996 bool log_missing,
997 char ***added);
998
999int seccomp_add_syscall_filter_item(
1000 scmp_filter_ctx *seccomp,
1001 const char *name,
1002 uint32_t action,
1003 char **exclude,
1004 bool log_missing,
1005 char ***added) {
69b1b241
LP
1006
1007 assert(seccomp);
1008 assert(name);
1009
960e4569
LP
1010 if (strv_contains(exclude, name))
1011 return 0;
1012
000c0520
ZJS
1013 /* Any syscalls that are handled are added to the *added strv. The pointer
1014 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1015
69b1b241
LP
1016 if (name[0] == '@') {
1017 const SyscallFilterSet *other;
1018
1019 other = syscall_filter_set_find(name);
baaa35ad
ZJS
1020 if (!other)
1021 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
1022 "Filter set %s is not known!",
1023 name);
69b1b241 1024
000c0520 1025 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 1026
69b1b241 1027 } else {
b54f36c6 1028 int id, r;
69b1b241
LP
1029
1030 id = seccomp_syscall_resolve_name(name);
cff7bff8 1031 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
1032 if (log_missing)
1033 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 1034 return 0;
cff7bff8 1035 }
69b1b241
LP
1036
1037 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 1038 if (r < 0) {
69b1b241 1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
1040 bool ignore = r == -EDOM;
1041
1042 if (!ignore || log_missing)
1043 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1044 name, id, ignore ? ", ignoring" : "");
1045 if (!ignore)
1046 return r;
b54f36c6 1047 }
69b1b241 1048
000c0520
ZJS
1049 if (added) {
1050 r = strv_extend(added, name);
1051 if (r < 0)
1052 return r;
1053 }
1054
b54f36c6
ZJS
1055 return 0;
1056 }
69b1b241
LP
1057}
1058
000c0520 1059static int add_syscall_filter_set(
469830d1 1060 scmp_filter_ctx seccomp,
469830d1 1061 const SyscallFilterSet *set,
960e4569 1062 uint32_t action,
b54f36c6 1063 char **exclude,
000c0520
ZJS
1064 bool log_missing,
1065 char ***added) {
469830d1 1066
8130926d
LP
1067 int r;
1068
000c0520
ZJS
1069 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1070
8130926d
LP
1071 assert(seccomp);
1072 assert(set);
1073
1074 NULSTR_FOREACH(sys, set->value) {
000c0520 1075 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1076 if (r < 0)
1077 return r;
469830d1
LP
1078 }
1079
1080 return 0;
1081}
1082
b54f36c6 1083int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
1084 uint32_t arch;
1085 int r;
1086
1087 assert(set);
1088
1089 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1090 * each local arch. */
469830d1
LP
1091
1092 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1093 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1094
30868c1c 1095 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1096
1097 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
1098 if (r < 0)
1099 return r;
469830d1 1100
000c0520 1101 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
7e86bd73
ZJS
1102 if (r < 0)
1103 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
1104
1105 r = seccomp_load(seccomp);
3c098014
ZJS
1106 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1107 return r;
1108 if (r < 0)
1109 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m",
1110 seccomp_arch_to_string(arch));
8130926d
LP
1111 }
1112
1113 return 0;
1114}
a3be2849 1115
1862b310 1116int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
469830d1 1117 uint32_t arch;
a3be2849
LP
1118 int r;
1119
1862b310
YW
1120 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1121 * of a SyscallFilterSet* table. */
a3be2849 1122
1862b310 1123 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
469830d1 1124 return 0;
a3be2849 1125
469830d1
LP
1126 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1127 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1128 void *syscall_id, *val;
a3be2849 1129
30868c1c 1130 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1131
2331c02d
ZJS
1132 /* We install ENOSYS as the default action, but it will only apply to syscalls which are not
1133 * in the @known set. */
1134 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(ENOSYS));
469830d1
LP
1135 if (r < 0)
1136 return r;
a3be2849 1137
1862b310 1138 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
8cfa775f 1139 uint32_t a = action;
b54f36c6
ZJS
1140 int id = PTR_TO_INT(syscall_id) - 1;
1141 int error = PTR_TO_INT(val);
8cfa775f 1142
005bfaf1
TM
1143 if (error == SECCOMP_ERROR_NUMBER_KILL)
1144 a = scmp_act_kill_process();
9df2cdd8
TM
1145#ifdef SCMP_ACT_LOG
1146 else if (action == SCMP_ACT_LOG)
1147 a = SCMP_ACT_LOG;
1148#endif
68acc1af 1149 else if (error >= 0)
b54f36c6 1150 a = SCMP_ACT_ERRNO(error);
8cfa775f 1151
b54f36c6 1152 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1 1153 if (r < 0) {
1862b310
YW
1154 /* If the system call is not known on this architecture, then that's
1155 * fine, let's ignore it */
469830d1 1156 _cleanup_free_ char *n = NULL;
7e86bd73 1157 bool ignore;
469830d1 1158
b54f36c6 1159 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1160 ignore = r == -EDOM;
1161 if (!ignore || log_missing)
1162 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1163 strna(n), id, ignore ? ", ignoring" : "");
1164 if (!ignore)
1165 return r;
469830d1
LP
1166 }
1167 }
1168
2331c02d
ZJS
1169 NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
1170 int id;
1171
1172 id = seccomp_syscall_resolve_name(name);
1173 if (id < 0)
1174 continue;
1175
1176 /* Ignore the syscall if it was already handled above */
1177 if (hashmap_contains(filter, INT_TO_PTR(id + 1)))
1178 continue;
1179
1180 r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
1181 if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */
1182 return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
1183 name, id);
1184 }
1185
469830d1 1186 r = seccomp_load(seccomp);
3c098014
ZJS
1187 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1188 return r;
1189 if (r < 0)
a52765a5 1190 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1862b310 1191 seccomp_arch_to_string(arch));
469830d1
LP
1192 }
1193
1194 return 0;
add00535
LP
1195}
1196
58f6ab44 1197int seccomp_parse_syscall_filter(
898748d8
YW
1198 const char *name,
1199 int errno_num,
1200 Hashmap *filter,
13d92c63 1201 SeccompParseFlags flags,
898748d8
YW
1202 const char *unit,
1203 const char *filename,
1204 unsigned line) {
1205
1206 int r;
1207
1208 assert(name);
1209 assert(filter);
1210
084a46d7
YW
1211 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1212 return -EINVAL;
1213
898748d8
YW
1214 if (name[0] == '@') {
1215 const SyscallFilterSet *set;
898748d8
YW
1216
1217 set = syscall_filter_set_find(name);
1218 if (!set) {
9e29ee40 1219 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1220 return -EINVAL;
13d92c63 1221
9e29ee40 1222 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1223 "Unknown system call group, ignoring: %s", name);
1224 return 0;
898748d8
YW
1225 }
1226
1227 NULSTR_FOREACH(i, set->value) {
3c098014
ZJS
1228 /* Call ourselves again, for the group to parse. Note that we downgrade logging here
1229 * (i.e. take away the SECCOMP_PARSE_LOG flag) since any issues in the group table
1230 * are our own problem, not a problem in user configuration data and we shouldn't
1231 * pretend otherwise by complaining about them. */
58f6ab44 1232 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1233 if (r < 0)
1234 return r;
1235 }
1236 } else {
1237 int id;
1238
1239 id = seccomp_syscall_resolve_name(name);
1240 if (id == __NR_SCMP_ERROR) {
9e29ee40 1241 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1242 return -EINVAL;
13d92c63 1243
9e29ee40 1244 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1245 "Failed to parse system call, ignoring: %s", name);
1246 return 0;
898748d8
YW
1247 }
1248
3c098014
ZJS
1249 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it
1250 * from the list. The entries in allow-list with non-negative error value will be handled
1251 * with SCMP_ACT_ERRNO() instead of the default action. */
68acc1af
YW
1252 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1253 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
898748d8
YW
1254 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1255 if (r < 0)
851ee70a
LW
1256 switch (r) {
1257 case -ENOMEM:
9e29ee40 1258 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
851ee70a 1259 case -EEXIST:
9d7fe7c6
LW
1260 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1261 break;
851ee70a
LW
1262 default:
1263 return r;
1264 }
898748d8
YW
1265 } else
1266 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1267 }
1268
1269 return 0;
1270}
1271
add00535 1272int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1273 uint32_t arch;
add00535
LP
1274 int r;
1275
f1d34068 1276 if (DEBUG_LOGGING) {
add00535
LP
1277 _cleanup_free_ char *s = NULL;
1278
86c2a9f1 1279 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1280 log_debug("Restricting namespace to: %s.", strna(s));
1281 }
1282
1283 /* NOOP? */
d7a0f1f4 1284 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
add00535
LP
1285 return 0;
1286
469830d1
LP
1287 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1288 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1289
30868c1c 1290 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1291
1292 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1293 if (r < 0)
1294 return r;
1295
30193fe8
ZJS
1296 /* We cannot filter on individual flags to clone3(), and we need to disable the
1297 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1298 * users shall fall back to clone(), as if on an older kernel.
1299 *
1300 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1301 * https://github.com/moby/moby/issues/42680. */
1302
1303 r = seccomp_rule_add_exact(
1304 seccomp,
1305 SCMP_ACT_ERRNO(ENOSYS),
1306 SCMP_SYS(clone3),
1307 0);
1308 if (r < 0)
3c098014
ZJS
1309 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m",
1310 seccomp_arch_to_string(arch));
30193fe8 1311
469830d1 1312 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
3c098014
ZJS
1313 /* If every single kind of namespace shall be prohibited, then let's block the whole
1314 * setns() syscall altogether. */
469830d1
LP
1315 r = seccomp_rule_add_exact(
1316 seccomp,
1317 SCMP_ACT_ERRNO(EPERM),
1318 SCMP_SYS(setns),
1319 0);
1320 else
3c098014
ZJS
1321 /* Otherwise, block only the invocations with the appropriate flags in the loop
1322 * below, but also the special invocation with a zero flags argument, right here. */
469830d1
LP
1323 r = seccomp_rule_add_exact(
1324 seccomp,
1325 SCMP_ACT_ERRNO(EPERM),
1326 SCMP_SYS(setns),
1327 1,
1328 SCMP_A1(SCMP_CMP_EQ, 0));
1329 if (r < 0) {
3c098014
ZJS
1330 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1331 seccomp_arch_to_string(arch));
469830d1
LP
1332 continue;
1333 }
1334
241b1577 1335 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
469830d1
LP
1336 unsigned long f;
1337
241b1577 1338 f = namespace_info[i].clone_flag;
d7a0f1f4 1339 if (FLAGS_SET(retain, f)) {
241b1577 1340 log_debug("Permitting %s.", namespace_info[i].proc_name);
469830d1
LP
1341 continue;
1342 }
1343
30868c1c 1344 log_trace("Blocking %s.", namespace_info[i].proc_name);
469830d1
LP
1345
1346 r = seccomp_rule_add_exact(
1347 seccomp,
1348 SCMP_ACT_ERRNO(EPERM),
1349 SCMP_SYS(unshare),
1350 1,
1351 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1352 if (r < 0) {
3c098014
ZJS
1353 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m",
1354 seccomp_arch_to_string(arch));
469830d1
LP
1355 break;
1356 }
1357
511ceb1f
ZJS
1358 /* On s390/s390x the first two parameters to clone are switched */
1359 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1360 r = seccomp_rule_add_exact(
1361 seccomp,
1362 SCMP_ACT_ERRNO(EPERM),
1363 SCMP_SYS(clone),
1364 1,
1365 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1366 else
1367 r = seccomp_rule_add_exact(
1368 seccomp,
1369 SCMP_ACT_ERRNO(EPERM),
1370 SCMP_SYS(clone),
1371 1,
1372 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1 1373 if (r < 0) {
3c098014
ZJS
1374 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m",
1375 seccomp_arch_to_string(arch));
469830d1
LP
1376 break;
1377 }
1378
1379 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1380 r = seccomp_rule_add_exact(
1381 seccomp,
1382 SCMP_ACT_ERRNO(EPERM),
1383 SCMP_SYS(setns),
1384 1,
1385 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1386 if (r < 0) {
3c098014
ZJS
1387 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1388 seccomp_arch_to_string(arch));
469830d1
LP
1389 break;
1390 }
1391 }
1392 }
1393 if (r < 0)
1394 continue;
1395
1396 r = seccomp_load(seccomp);
3c098014
ZJS
1397 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1398 return r;
1399 if (r < 0)
1400 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m",
1401 seccomp_arch_to_string(arch));
469830d1
LP
1402 }
1403
1404 return 0;
1405}
1406
1407int seccomp_protect_sysctl(void) {
1408 uint32_t arch;
1409 int r;
1410
1411 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1412 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1413
30868c1c 1414 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1415
f9252236
AJ
1416 if (IN_SET(arch,
1417 SCMP_ARCH_AARCH64,
f9d3fb6b
XW
1418#ifdef SCMP_ARCH_LOONGARCH64
1419 SCMP_ARCH_LOONGARCH64,
1420#endif
f9252236
AJ
1421#ifdef SCMP_ARCH_RISCV64
1422 SCMP_ARCH_RISCV64,
1423#endif
1424 SCMP_ARCH_X32
1425 ))
2e64e8f4
ZJS
1426 /* No _sysctl syscall */
1427 continue;
1428
469830d1
LP
1429 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1430 if (r < 0)
1431 return r;
1432
1433 r = seccomp_rule_add_exact(
add00535
LP
1434 seccomp,
1435 SCMP_ACT_ERRNO(EPERM),
469830d1 1436 SCMP_SYS(_sysctl),
add00535 1437 0);
469830d1 1438 if (r < 0) {
3c098014
ZJS
1439 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m",
1440 seccomp_arch_to_string(arch));
469830d1
LP
1441 continue;
1442 }
1443
1444 r = seccomp_load(seccomp);
3c098014
ZJS
1445 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1446 return r;
1447 if (r < 0)
1448 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m",
1449 seccomp_arch_to_string(arch));
469830d1
LP
1450 }
1451
1452 return 0;
1453}
1454
620dbdd2
KK
1455int seccomp_protect_syslog(void) {
1456 uint32_t arch;
1457 int r;
1458
1459 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1460 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1461
1462 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1463 if (r < 0)
1464 return r;
1465
1466 r = seccomp_rule_add_exact(
1467 seccomp,
1468 SCMP_ACT_ERRNO(EPERM),
1469 SCMP_SYS(syslog),
1470 0);
1471
1472 if (r < 0) {
1473 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1474 continue;
1475 }
1476
1477 r = seccomp_load(seccomp);
3c098014
ZJS
1478 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1479 return r;
1480 if (r < 0)
1481 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m",
1482 seccomp_arch_to_string(arch));
620dbdd2
KK
1483 }
1484
1485 return 0;
1486}
1487
6b000af4 1488int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1489 uint32_t arch;
1490 int r;
1491
1492 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1493 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1494 bool supported;
469830d1 1495
30868c1c 1496 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1497
9606bc4b
LP
1498 switch (arch) {
1499
1500 case SCMP_ARCH_X86_64:
1501 case SCMP_ARCH_X32:
1502 case SCMP_ARCH_ARM:
1503 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1504#ifdef SCMP_ARCH_LOONGARCH64
1505 case SCMP_ARCH_LOONGARCH64:
1506#endif
f5aeac14
JC
1507 case SCMP_ARCH_MIPSEL64N32:
1508 case SCMP_ARCH_MIPS64N32:
1509 case SCMP_ARCH_MIPSEL64:
1510 case SCMP_ARCH_MIPS64:
f9252236
AJ
1511#ifdef SCMP_ARCH_RISCV64
1512 case SCMP_ARCH_RISCV64:
1513#endif
9606bc4b
LP
1514 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1515 supported = true;
1516 break;
1517
9606bc4b
LP
1518 case SCMP_ARCH_S390:
1519 case SCMP_ARCH_S390X:
da1921a5 1520 case SCMP_ARCH_X86:
f5aeac14
JC
1521 case SCMP_ARCH_MIPSEL:
1522 case SCMP_ARCH_MIPS:
344e6b62
SJ
1523#ifdef SCMP_ARCH_PARISC
1524 case SCMP_ARCH_PARISC:
1525#endif
1526#ifdef SCMP_ARCH_PARISC64
1527 case SCMP_ARCH_PARISC64:
1528#endif
d5923e38
ZJS
1529 case SCMP_ARCH_PPC:
1530 case SCMP_ARCH_PPC64:
1531 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1532 default:
1533 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1534 * don't know */
1535 supported = false;
1536 break;
1537 }
1538
1539 if (!supported)
1540 continue;
1541
469830d1
LP
1542 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1543 if (r < 0)
1544 return r;
1545
6b000af4 1546 if (allow_list) {
077e8fc0 1547 int first = 0, last = 0;
469830d1
LP
1548 void *afp;
1549
6b000af4
LP
1550 /* If this is an allow list, we first block the address families that are out of
1551 * range and then everything that is not in the set. First, we find the lowest and
1552 * highest address family in the set. */
469830d1 1553
90e74a66 1554 SET_FOREACH(afp, address_families) {
077e8fc0 1555 int af = PTR_TO_INT(afp);
469830d1
LP
1556
1557 if (af <= 0 || af >= af_max())
1558 continue;
1559
1560 if (first == 0 || af < first)
1561 first = af;
1562
1563 if (last == 0 || af > last)
1564 last = af;
1565 }
1566
1567 assert((first == 0) == (last == 0));
1568
1569 if (first == 0) {
1570
1571 /* No entries in the valid range, block everything */
1572 r = seccomp_rule_add_exact(
1573 seccomp,
1574 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1575 SCMP_SYS(socket),
1576 0);
1577 if (r < 0) {
3c098014
ZJS
1578 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1579 seccomp_arch_to_string(arch));
469830d1
LP
1580 continue;
1581 }
1582
1583 } else {
1584
1585 /* Block everything below the first entry */
1586 r = seccomp_rule_add_exact(
1587 seccomp,
1588 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1589 SCMP_SYS(socket),
1590 1,
1591 SCMP_A0(SCMP_CMP_LT, first));
1592 if (r < 0) {
3c098014
ZJS
1593 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1594 seccomp_arch_to_string(arch));
469830d1
LP
1595 continue;
1596 }
1597
1598 /* Block everything above the last entry */
1599 r = seccomp_rule_add_exact(
1600 seccomp,
1601 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1602 SCMP_SYS(socket),
1603 1,
1604 SCMP_A0(SCMP_CMP_GT, last));
1605 if (r < 0) {
3c098014
ZJS
1606 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1607 seccomp_arch_to_string(arch));
469830d1
LP
1608 continue;
1609 }
1610
1611 /* Block everything between the first and last entry */
077e8fc0 1612 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1613
1614 if (set_contains(address_families, INT_TO_PTR(af)))
1615 continue;
1616
1617 r = seccomp_rule_add_exact(
1618 seccomp,
1619 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1620 SCMP_SYS(socket),
1621 1,
1622 SCMP_A0(SCMP_CMP_EQ, af));
1623 if (r < 0)
1624 break;
1625 }
469830d1 1626 if (r < 0) {
3c098014
ZJS
1627 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1628 seccomp_arch_to_string(arch));
469830d1
LP
1629 continue;
1630 }
1631 }
1632
1633 } else {
1634 void *af;
1635
6b000af4
LP
1636 /* If this is a deny list, then generate one rule for each address family that are
1637 * then combined in OR checks. */
469830d1 1638
90e74a66 1639 SET_FOREACH(af, address_families) {
469830d1
LP
1640 r = seccomp_rule_add_exact(
1641 seccomp,
1642 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1643 SCMP_SYS(socket),
1644 1,
1645 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1646 if (r < 0)
1647 break;
1648 }
469830d1 1649 if (r < 0) {
3c098014
ZJS
1650 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1651 seccomp_arch_to_string(arch));
469830d1
LP
1652 continue;
1653 }
1654 }
1655
1656 r = seccomp_load(seccomp);
3c098014
ZJS
1657 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1658 return r;
1659 if (r < 0)
1660 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m",
1661 seccomp_arch_to_string(arch));
469830d1
LP
1662 }
1663
1664 return 0;
1665}
1666
a9002749 1667int seccomp_restrict_realtime_full(int error_code) {
469830d1
LP
1668 static const int permitted_policies[] = {
1669 SCHED_OTHER,
1670 SCHED_BATCH,
1671 SCHED_IDLE,
1672 };
1673
1674 int r, max_policy = 0;
1675 uint32_t arch;
1676 unsigned i;
1677
a9002749
YW
1678 assert(error_code > 0);
1679
469830d1
LP
1680 /* Determine the highest policy constant we want to allow */
1681 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1682 if (permitted_policies[i] > max_policy)
1683 max_policy = permitted_policies[i];
1684
1685 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1686 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1687 int p;
1688
30868c1c 1689 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1690
1691 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1692 if (r < 0)
1693 return r;
1694
1695 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1696 * allow list. */
469830d1
LP
1697 for (p = 0; p < max_policy; p++) {
1698 bool good = false;
1699
6b000af4 1700 /* Check if this is in the allow list. */
469830d1
LP
1701 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1702 if (permitted_policies[i] == p) {
1703 good = true;
1704 break;
1705 }
1706
1707 if (good)
1708 continue;
1709
1710 /* Deny this policy */
1711 r = seccomp_rule_add_exact(
1712 seccomp,
a9002749 1713 SCMP_ACT_ERRNO(error_code),
469830d1
LP
1714 SCMP_SYS(sched_setscheduler),
1715 1,
1716 SCMP_A1(SCMP_CMP_EQ, p));
1717 if (r < 0) {
3c098014
ZJS
1718 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1719 seccomp_arch_to_string(arch));
469830d1
LP
1720 continue;
1721 }
1722 }
1723
6b000af4
LP
1724 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1725 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1726 r = seccomp_rule_add_exact(
add00535 1727 seccomp,
a9002749 1728 SCMP_ACT_ERRNO(error_code),
469830d1 1729 SCMP_SYS(sched_setscheduler),
add00535 1730 1,
469830d1
LP
1731 SCMP_A1(SCMP_CMP_GT, max_policy));
1732 if (r < 0) {
3c098014
ZJS
1733 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1734 seccomp_arch_to_string(arch));
469830d1
LP
1735 continue;
1736 }
add00535 1737
469830d1 1738 r = seccomp_load(seccomp);
3c098014
ZJS
1739 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1740 return r;
1741 if (r < 0)
1742 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m",
1743 seccomp_arch_to_string(arch));
469830d1
LP
1744 }
1745
1746 return 0;
1747}
1748
6dc66688
ZJS
1749static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1750 uint32_t arch,
1751 int nr,
14cb109d 1752 unsigned arg_cnt,
6dc66688
ZJS
1753 const struct scmp_arg_cmp arg) {
1754 int r;
1755
1756 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1757 if (r < 0) {
1758 _cleanup_free_ char *n = NULL;
1759
1760 n = seccomp_syscall_resolve_num_arch(arch, nr);
1761 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1762 strna(n),
1763 seccomp_arch_to_string(arch));
1764 }
1765
1766 return r;
1767}
1768
2a8d6e63 1769/* For known architectures, check that syscalls are indeed defined or not. */
f9d3fb6b 1770#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
2a8d6e63
ZJS
1771assert_cc(SCMP_SYS(shmget) > 0);
1772assert_cc(SCMP_SYS(shmat) > 0);
1773assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1774#endif
6dc66688 1775
469830d1
LP
1776int seccomp_memory_deny_write_execute(void) {
1777 uint32_t arch;
b069c2a3 1778 unsigned loaded = 0;
469830d1
LP
1779
1780 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1781 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1782 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1783
30868c1c 1784 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1785
8a50cf69
LP
1786 switch (arch) {
1787
bed4668d
CE
1788 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1789 * We ignore that here, which means there's still a way to get writable/executable
344e6b62
SJ
1790 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1791 *
1792 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1793 * on that front (kernel work done in 5.18).
1794 */
bed4668d 1795
8a50cf69 1796 case SCMP_ARCH_X86:
57311925 1797 case SCMP_ARCH_S390:
8a50cf69
LP
1798 filter_syscall = SCMP_SYS(mmap2);
1799 block_syscall = SCMP_SYS(mmap);
bed4668d 1800 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1801 break;
1802
63d00dfb 1803 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1804 case SCMP_ARCH_PPC64:
1805 case SCMP_ARCH_PPC64LE:
bed4668d 1806 case SCMP_ARCH_S390X:
2a8d6e63 1807 filter_syscall = SCMP_SYS(mmap);
bed4668d 1808 /* shmat multiplexed, see above */
8a50cf69
LP
1809 break;
1810
4278d1f5
ZJS
1811 case SCMP_ARCH_ARM:
1812 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1813 shmat_syscall = SCMP_SYS(shmat);
1814 break;
1815
8a50cf69
LP
1816 case SCMP_ARCH_X86_64:
1817 case SCMP_ARCH_X32:
79873bc8 1818 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1819#ifdef SCMP_ARCH_LOONGARCH64
1820 case SCMP_ARCH_LOONGARCH64:
1821#endif
f9252236
AJ
1822#ifdef SCMP_ARCH_RISCV64
1823 case SCMP_ARCH_RISCV64:
1824#endif
f9d3fb6b 1825 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
8a50cf69
LP
1826 shmat_syscall = SCMP_SYS(shmat);
1827 break;
1828
1829 /* Please add more definitions here, if you port systemd to other architectures! */
1830
f9d3fb6b 1831#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
8a50cf69
LP
1832#warning "Consider adding the right mmap() syscall definitions here!"
1833#endif
1834 }
1835
1836 /* Can't filter mmap() on this arch, then skip it */
1837 if (filter_syscall == 0)
1838 continue;
1839
469830d1
LP
1840 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1841 if (r < 0)
1842 return r;
1843
6dc66688
ZJS
1844 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1845 1,
1846 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1847 if (r < 0)
1848 continue;
8a50cf69
LP
1849
1850 if (block_syscall != 0) {
6dc66688
ZJS
1851 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1852 if (r < 0)
8a50cf69 1853 continue;
add00535 1854 }
a3be2849 1855
6dc66688
ZJS
1856 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1857 1,
b835eeb4
ZJS
1858 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1859 if (r < 0)
1860 continue;
1861
1862 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1863 1,
6dc66688
ZJS
1864 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1865 if (r < 0)
469830d1 1866 continue;
add00535 1867
67fb5f33 1868 if (shmat_syscall > 0) {
5ef3ed97 1869 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1870 1,
1871 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1872 if (r < 0)
8a50cf69 1873 continue;
469830d1
LP
1874 }
1875
1876 r = seccomp_load(seccomp);
3c098014
ZJS
1877 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1878 return r;
1879 if (r < 0)
b069c2a3
ZJS
1880 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1881 seccomp_arch_to_string(arch));
903659e7 1882 loaded++;
469830d1 1883 }
add00535 1884
903659e7 1885 if (loaded == 0)
b069c2a3 1886 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1887
1888 return loaded;
469830d1
LP
1889}
1890
1891int seccomp_restrict_archs(Set *archs) {
1892 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1 1893 int r;
65976868 1894 bool blocked_new = false;
469830d1
LP
1895
1896 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1897 * list.
1898 *
1899 * There are some qualifications. However the most important use is to stop processes from bypassing
1900 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1901 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1902
2428aaf8
AJ
1903 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1904 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1905 * to run a program with the restrictions applied. */
469830d1
LP
1906 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1907 if (!seccomp)
1908 return -ENOMEM;
1909
65976868
GDF
1910 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1911 uint32_t arch = seccomp_local_archs[i];
2428aaf8 1912
f833df38
BB
1913 /* See above comment, our "native" architecture is never blocked. */
1914 if (arch == seccomp_arch_native())
1915 continue;
1916
65976868
GDF
1917 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1918 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1919 continue;
2428aaf8 1920
65976868 1921 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
2428aaf8 1922
65976868
GDF
1923 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1924 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1925 * The important thing is that you can block the old 32-bit x86 syscalls.
1926 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1927 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1928 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1929
1930 if (block) {
1931 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1932 blocked_new = true;
1933 } else {
1934 r = seccomp_arch_add(seccomp, arch);
1935 if (r < 0 && r != -EEXIST)
1936 return r;
1937 }
add00535
LP
1938 }
1939
65976868
GDF
1940 /* All architectures that will be blocked by the seccomp program were
1941 * already blocked. */
1942 if (!blocked_new)
1943 return 0;
1944
469830d1
LP
1945 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1946 if (r < 0)
1947 return r;
add00535 1948
1c6af69b 1949 r = seccomp_load(seccomp);
3c098014
ZJS
1950 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1951 return r;
1952 if (r < 0)
1c6af69b
LP
1953 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1954
1955 return 0;
a3be2849 1956}
b16bd535 1957
de7fef4b
ZJS
1958int parse_syscall_archs(char **l, Set **ret_archs) {
1959 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1960 int r;
1961
1962 assert(l);
de7fef4b 1963 assert(ret_archs);
b16bd535
YW
1964
1965 STRV_FOREACH(s, l) {
1966 uint32_t a;
1967
1968 r = seccomp_arch_from_string(*s, &a);
1969 if (r < 0)
1970 return -EINVAL;
1971
de7fef4b 1972 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1973 if (r < 0)
1974 return -ENOMEM;
1975 }
1976
de7fef4b 1977 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1978 return 0;
1979}
165a31c0 1980
8cfa775f 1981int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1982 int r;
1983
1984 assert(set);
1985
1986 NULSTR_FOREACH(i, set->value) {
1987
1988 if (i[0] == '@') {
1989 const SyscallFilterSet *more;
1990
1991 more = syscall_filter_set_find(i);
1992 if (!more)
1993 return -ENXIO;
1994
165a31c0
LP
1995 r = seccomp_filter_set_add(filter, add, more);
1996 if (r < 0)
1997 return r;
1998 } else {
1999 int id;
2000
2001 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
2002 if (id == __NR_SCMP_ERROR) {
2003 log_debug("Couldn't resolve system call, ignoring: %s", i);
2004 continue;
2005 }
165a31c0
LP
2006
2007 if (add) {
8cfa775f 2008 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
2009 if (r < 0)
2010 return r;
2011 } else
8cfa775f 2012 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
2013 }
2014 }
2015
2016 return 0;
2017}
78e864e5
TM
2018
2019int seccomp_lock_personality(unsigned long personality) {
72eafe71 2020 uint32_t arch;
78e864e5
TM
2021 int r;
2022
72eafe71
LP
2023 if (personality >= PERSONALITY_INVALID)
2024 return -EINVAL;
78e864e5 2025
72eafe71
LP
2026 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2027 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 2028
72eafe71
LP
2029 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2030 if (r < 0)
2031 return r;
2032
2033 r = seccomp_rule_add_exact(
2034 seccomp,
2035 SCMP_ACT_ERRNO(EPERM),
2036 SCMP_SYS(personality),
2037 1,
2038 SCMP_A0(SCMP_CMP_NE, personality));
448ac526 2039 if (r < 0) {
3c098014
ZJS
2040 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
2041 seccomp_arch_to_string(arch));
448ac526
LP
2042 continue;
2043 }
72eafe71
LP
2044
2045 r = seccomp_load(seccomp);
3c098014
ZJS
2046 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2047 return r;
2048 if (r < 0)
2049 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m",
2050 seccomp_arch_to_string(arch));
72eafe71
LP
2051 }
2052
2053 return 0;
78e864e5 2054}
aecd5ac6
TM
2055
2056int seccomp_protect_hostname(void) {
2057 uint32_t arch;
2058 int r;
2059
2060 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2061 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2062
2063 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2064 if (r < 0)
2065 return r;
2066
2067 r = seccomp_rule_add_exact(
2068 seccomp,
2069 SCMP_ACT_ERRNO(EPERM),
2070 SCMP_SYS(sethostname),
2071 0);
9e6e543c 2072 if (r < 0) {
3c098014
ZJS
2073 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m",
2074 seccomp_arch_to_string(arch));
aecd5ac6 2075 continue;
9e6e543c 2076 }
aecd5ac6
TM
2077
2078 r = seccomp_rule_add_exact(
2079 seccomp,
2080 SCMP_ACT_ERRNO(EPERM),
2081 SCMP_SYS(setdomainname),
2082 0);
9e6e543c 2083 if (r < 0) {
3c098014
ZJS
2084 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m",
2085 seccomp_arch_to_string(arch));
aecd5ac6 2086 continue;
9e6e543c 2087 }
aecd5ac6
TM
2088
2089 r = seccomp_load(seccomp);
3c098014
ZJS
2090 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2091 return r;
2092 if (r < 0)
2093 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m",
2094 seccomp_arch_to_string(arch));
aecd5ac6
TM
2095 }
2096
2097 return 0;
2098}
3c27973b 2099
da4dc9a6
ZJS
2100static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2101 /* Checks the mode_t parameter of the following system calls:
2102 *
8b45281d 2103 * → chmod() + fchmod() + fchmodat() + fchmodat2()
da4dc9a6
ZJS
2104 * → open() + creat() + openat()
2105 * → mkdir() + mkdirat()
2106 * → mknod() + mknodat()
2107 *
2108 * Returns error if *everything* failed, and 0 otherwise.
2109 */
6d95e7d9 2110 int r;
da4dc9a6
ZJS
2111 bool any = false;
2112
2113 r = seccomp_rule_add_exact(
2114 seccomp,
2115 SCMP_ACT_ERRNO(EPERM),
2116 SCMP_SYS(chmod),
2117 1,
2118 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2119 if (r < 0)
2120 log_debug_errno(r, "Failed to add filter for chmod: %m");
2121 else
2122 any = true;
2123
2124 r = seccomp_rule_add_exact(
2125 seccomp,
2126 SCMP_ACT_ERRNO(EPERM),
2127 SCMP_SYS(fchmod),
2128 1,
2129 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2130 if (r < 0)
2131 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2132 else
2133 any = true;
2134
2135 r = seccomp_rule_add_exact(
2136 seccomp,
2137 SCMP_ACT_ERRNO(EPERM),
2138 SCMP_SYS(fchmodat),
2139 1,
2140 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2141 if (r < 0)
2142 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2143 else
2144 any = true;
2145
8b45281d
AM
2146#if defined(__SNR_fchmodat2)
2147 r = seccomp_rule_add_exact(
2148 seccomp,
2149 SCMP_ACT_ERRNO(EPERM),
2150 SCMP_SYS(fchmodat2),
2151 1,
2152 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2153#else
2154 /* It looks like this libseccomp does not know about fchmodat2().
2155 * Pretend the fchmodat2() system call is not supported at all,
2156 * regardless of the kernel version. */
2157 r = seccomp_rule_add_exact(
2158 seccomp,
2159 SCMP_ACT_ERRNO(ENOSYS),
2160 __NR_fchmodat2,
2161 0);
2162#endif
2163 if (r < 0)
2164 log_debug_errno(r, "Failed to add filter for fchmodat2: %m");
2165 else
2166 any = true;
2167
da4dc9a6
ZJS
2168 r = seccomp_rule_add_exact(
2169 seccomp,
2170 SCMP_ACT_ERRNO(EPERM),
2171 SCMP_SYS(mkdir),
2172 1,
2173 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2174 if (r < 0)
2175 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2176 else
2177 any = true;
2178
2179 r = seccomp_rule_add_exact(
2180 seccomp,
2181 SCMP_ACT_ERRNO(EPERM),
2182 SCMP_SYS(mkdirat),
2183 1,
2184 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2185 if (r < 0)
2186 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2187 else
2188 any = true;
2189
2190 r = seccomp_rule_add_exact(
2191 seccomp,
2192 SCMP_ACT_ERRNO(EPERM),
2193 SCMP_SYS(mknod),
2194 1,
2195 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2196 if (r < 0)
2197 log_debug_errno(r, "Failed to add filter for mknod: %m");
2198 else
2199 any = true;
2200
2201 r = seccomp_rule_add_exact(
2202 seccomp,
2203 SCMP_ACT_ERRNO(EPERM),
2204 SCMP_SYS(mknodat),
2205 1,
2206 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2207 if (r < 0)
2208 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2209 else
2210 any = true;
2211
da4dc9a6
ZJS
2212 r = seccomp_rule_add_exact(
2213 seccomp,
2214 SCMP_ACT_ERRNO(EPERM),
2215 SCMP_SYS(open),
2216 2,
2217 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2218 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2219 if (r < 0)
2220 log_debug_errno(r, "Failed to add filter for open: %m");
2221 else
2222 any = true;
da4dc9a6
ZJS
2223
2224 r = seccomp_rule_add_exact(
2225 seccomp,
2226 SCMP_ACT_ERRNO(EPERM),
2227 SCMP_SYS(openat),
2228 2,
2229 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2230 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2231 if (r < 0)
2232 log_debug_errno(r, "Failed to add filter for openat: %m");
2233 else
2234 any = true;
2235
ecc04067
LP
2236#if defined(__SNR_openat2)
2237 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2238 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2239 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
57353d29
MG
2240 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2241 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2242 * to call open() or openat() instead. We can properly enforce policy for those functions. */
ecc04067
LP
2243 r = seccomp_rule_add_exact(
2244 seccomp,
57353d29 2245 SCMP_ACT_ERRNO(ENOSYS),
ecc04067
LP
2246 SCMP_SYS(openat2),
2247 0);
2248 if (r < 0)
2249 log_debug_errno(r, "Failed to add filter for openat2: %m");
2250 else
2251 any = true;
2252#endif
2253
da4dc9a6
ZJS
2254 r = seccomp_rule_add_exact(
2255 seccomp,
2256 SCMP_ACT_ERRNO(EPERM),
2257 SCMP_SYS(creat),
2258 1,
2259 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2260 if (r < 0)
2261 log_debug_errno(r, "Failed to add filter for creat: %m");
2262 else
2263 any = true;
2264
2265 return any ? 0 : r;
2266}
2267
3c27973b
LP
2268int seccomp_restrict_suid_sgid(void) {
2269 uint32_t arch;
da4dc9a6 2270 int r, k;
3c27973b
LP
2271
2272 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2273 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2274
2275 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2276 if (r < 0)
2277 return r;
2278
da4dc9a6
ZJS
2279 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2280 if (r < 0)
3c098014
ZJS
2281 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m",
2282 seccomp_arch_to_string(arch));
3c27973b 2283
da4dc9a6
ZJS
2284 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2285 if (k < 0)
a539314a 2286 log_debug_errno(k, "Failed to add sgid rule for architecture %s, ignoring: %m",
3c098014 2287 seccomp_arch_to_string(arch));
3c27973b 2288
da4dc9a6 2289 if (r < 0 && k < 0)
3c27973b 2290 continue;
3c27973b
LP
2291
2292 r = seccomp_load(seccomp);
3c098014
ZJS
2293 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2294 return r;
2295 if (r < 0)
2296 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m",
2297 seccomp_arch_to_string(arch));
3c27973b
LP
2298 }
2299
2300 return 0;
2301}
915fb324
LP
2302
2303uint32_t scmp_act_kill_process(void) {
2304
2305 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2306 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2307 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2308 * for single-threaded apps does the right thing. */
2309
2310#ifdef SCMP_ACT_KILL_PROCESS
2311 if (seccomp_api_get() >= 3)
2312 return SCMP_ACT_KILL_PROCESS;
2313#endif
2314
2315 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2316}
22eadc28
YW
2317
2318int parse_syscall_and_errno(const char *in, char **name, int *error) {
2319 _cleanup_free_ char *n = NULL;
2320 char *p;
2321 int e = -1;
2322
2323 assert(in);
2324 assert(name);
2325 assert(error);
2326
2327 /*
2328 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2329 * If errno is omitted, then error is set to -1.
2330 * Empty syscall name is not allowed.
2331 * Here, we do not check that the syscall name is valid or not.
2332 */
2333
2334 p = strchr(in, ':');
2335 if (p) {
2336 e = seccomp_parse_errno_or_action(p + 1);
2337 if (e < 0)
2338 return e;
2339
2340 n = strndup(in, p - in);
2341 } else
2342 n = strdup(in);
2343
2344 if (!n)
2345 return -ENOMEM;
2346
2347 if (isempty(n))
2348 return -EINVAL;
2349
2350 *error = e;
2351 *name = TAKE_PTR(n);
2352
2353 return 0;
2354}
4a4654e0
LP
2355
2356static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2357 bool any = false;
2358 int r;
2359
2360 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2361 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2362
4a4654e0
LP
2363 r = seccomp_rule_add_exact(
2364 seccomp,
2365 SCMP_ACT_ERRNO(EINVAL),
2366 SCMP_SYS(open),
2367 1,
2368 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2369 if (r < 0)
2370 log_debug_errno(r, "Failed to add filter for open: %m");
2371 else
2372 any = true;
4a4654e0
LP
2373
2374 r = seccomp_rule_add_exact(
2375 seccomp,
2376 SCMP_ACT_ERRNO(EINVAL),
2377 SCMP_SYS(openat),
2378 1,
2379 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2380 if (r < 0)
2381 log_debug_errno(r, "Failed to add filter for openat: %m");
2382 else
2383 any = true;
2384
2385#if defined(__SNR_openat2)
2386 /* The new openat2() system call can't be filtered sensibly, see above. */
2387 r = seccomp_rule_add_exact(
2388 seccomp,
2389 SCMP_ACT_ERRNO(ENOSYS),
2390 SCMP_SYS(openat2),
2391 0);
2392 if (r < 0)
2393 log_debug_errno(r, "Failed to add filter for openat2: %m");
2394 else
2395 any = true;
2396#endif
2397
2398 return any ? 0 : r;
2399}
2400
2401int seccomp_suppress_sync(void) {
2402 uint32_t arch;
2403 int r;
2404
2405 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2406 * manageable, and also masks O_SYNC/O_DSYNC */
2407
2408 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2409 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
4a4654e0
LP
2410
2411 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2412 if (r < 0)
2413 return r;
2414
2415 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2416 int id;
2417
2418 id = seccomp_syscall_resolve_name(c);
2419 if (id == __NR_SCMP_ERROR) {
2420 log_debug("System call %s is not known, ignoring.", c);
2421 continue;
2422 }
2423
2424 r = seccomp_rule_add_exact(
2425 seccomp,
2426 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2427 id,
2428 0);
2429 if (r < 0)
2430 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2431 }
2432
2433 (void) block_open_flag(seccomp, O_SYNC);
2434#if O_DSYNC != O_SYNC
2435 (void) block_open_flag(seccomp, O_DSYNC);
2436#endif
2437
2438 r = seccomp_load(seccomp);
3c098014
ZJS
2439 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2440 return r;
2441 if (r < 0)
2442 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m",
2443 seccomp_arch_to_string(arch));
4a4654e0
LP
2444 }
2445
2446 return 0;
2447}