]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
Merge pull request #31524 from poettering/secure-getenv-naming-fix
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
a8fbdf54 6#include <stddef.h>
469830d1 7#include <sys/mman.h>
d347d902 8#include <sys/prctl.h>
469830d1 9#include <sys/shm.h>
3c27973b 10#include <sys/stat.h>
57183d11 11
e83156c2
YW
12/* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13#include "missing_syscall_def.h"
14#include <seccomp.h>
15
469830d1 16#include "af-list.h"
add00535 17#include "alloc-util.h"
44aaddad 18#include "env-util.h"
d8b4d14d 19#include "errno-list.h"
a8fbdf54 20#include "macro.h"
241b1577 21#include "namespace-util.h"
add00535 22#include "nsflags.h"
d8b4d14d 23#include "nulstr-util.h"
78e864e5 24#include "process-util.h"
cf0fbc49 25#include "seccomp-util.h"
b16bd535 26#include "set.h"
07630cea 27#include "string-util.h"
b16bd535 28#include "strv.h"
469830d1 29
65976868
GDF
30/* This array will be modified at runtime as seccomp_restrict_archs is called. */
31uint32_t seccomp_local_archs[] = {
469830d1 32
6b000af4 33 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
34
35#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
36 SCMP_ARCH_X86,
37 SCMP_ARCH_X86_64,
f2d9751c
LP
38 SCMP_ARCH_X32, /* native */
39#elif defined(__x86_64__) && !defined(__ILP32__)
40 SCMP_ARCH_X86,
469830d1 41 SCMP_ARCH_X32,
f2d9751c
LP
42 SCMP_ARCH_X86_64, /* native */
43#elif defined(__i386__)
44 SCMP_ARCH_X86,
45#elif defined(__aarch64__)
469830d1 46 SCMP_ARCH_ARM,
f2d9751c
LP
47 SCMP_ARCH_AARCH64, /* native */
48#elif defined(__arm__)
49 SCMP_ARCH_ARM,
f9d3fb6b
XW
50#elif defined(__loongarch_lp64)
51 SCMP_ARCH_LOONGARCH64,
f2d9751c
LP
52#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
53 SCMP_ARCH_MIPSEL,
54 SCMP_ARCH_MIPS, /* native */
55#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 56 SCMP_ARCH_MIPS,
f2d9751c
LP
57 SCMP_ARCH_MIPSEL, /* native */
58#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS,
61 SCMP_ARCH_MIPSEL64N32,
469830d1 62 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
63 SCMP_ARCH_MIPSEL64,
64 SCMP_ARCH_MIPS64, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL,
f2d9751c
LP
68 SCMP_ARCH_MIPS64N32,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64,
71 SCMP_ARCH_MIPSEL64, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS,
469830d1 75 SCMP_ARCH_MIPSEL64,
f2d9751c 76 SCMP_ARCH_MIPS64,
469830d1 77 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
78 SCMP_ARCH_MIPS64N32, /* native */
79#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPS,
81 SCMP_ARCH_MIPSEL,
82 SCMP_ARCH_MIPS64,
83 SCMP_ARCH_MIPSEL64,
84 SCMP_ARCH_MIPS64N32,
85 SCMP_ARCH_MIPSEL64N32, /* native */
344e6b62
SJ
86#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
87 SCMP_ARCH_PARISC,
88 SCMP_ARCH_PARISC64, /* native */
89#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
90 SCMP_ARCH_PARISC,
f2d9751c 91#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 92 SCMP_ARCH_PPC,
469830d1 93 SCMP_ARCH_PPC64LE,
f2d9751c
LP
94 SCMP_ARCH_PPC64, /* native */
95#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
96 SCMP_ARCH_PPC,
97 SCMP_ARCH_PPC64,
98 SCMP_ARCH_PPC64LE, /* native */
99#elif defined(__powerpc__)
100 SCMP_ARCH_PPC,
f9252236
AJ
101#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
102 SCMP_ARCH_RISCV64,
f2d9751c
LP
103#elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106#elif defined(__s390__)
469830d1 107 SCMP_ARCH_S390,
469830d1 108#endif
65976868 109 SECCOMP_LOCAL_ARCH_END
469830d1 110 };
57183d11
LP
111
112const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
57183d11 117
79893116 118 switch (c) {
aa34055f 119 case SCMP_ARCH_NATIVE:
57183d11 120 return "native";
aa34055f 121 case SCMP_ARCH_X86:
57183d11 122 return "x86";
aa34055f 123 case SCMP_ARCH_X86_64:
57183d11 124 return "x86-64";
aa34055f 125 case SCMP_ARCH_X32:
57183d11 126 return "x32";
aa34055f 127 case SCMP_ARCH_ARM:
57183d11 128 return "arm";
aa34055f
ZJS
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
f9d3fb6b
XW
131#ifdef SCMP_ARCH_LOONGARCH64
132 case SCMP_ARCH_LOONGARCH64:
133 return "loongarch64";
134#endif
aa34055f
ZJS
135 case SCMP_ARCH_MIPS:
136 return "mips";
137 case SCMP_ARCH_MIPS64:
138 return "mips64";
139 case SCMP_ARCH_MIPS64N32:
140 return "mips64-n32";
141 case SCMP_ARCH_MIPSEL:
142 return "mips-le";
143 case SCMP_ARCH_MIPSEL64:
144 return "mips64-le";
145 case SCMP_ARCH_MIPSEL64N32:
146 return "mips64-le-n32";
344e6b62
SJ
147#ifdef SCMP_ARCH_PARISC
148 case SCMP_ARCH_PARISC:
149 return "parisc";
150#endif
151#ifdef SCMP_ARCH_PARISC64
152 case SCMP_ARCH_PARISC64:
153 return "parisc64";
154#endif
aa34055f
ZJS
155 case SCMP_ARCH_PPC:
156 return "ppc";
157 case SCMP_ARCH_PPC64:
158 return "ppc64";
159 case SCMP_ARCH_PPC64LE:
160 return "ppc64-le";
f9252236
AJ
161#ifdef SCMP_ARCH_RISCV64
162 case SCMP_ARCH_RISCV64:
163 return "riscv64";
164#endif
aa34055f 165 case SCMP_ARCH_S390:
6abfd303 166 return "s390";
aa34055f 167 case SCMP_ARCH_S390X:
6abfd303 168 return "s390x";
aa34055f
ZJS
169 default:
170 return NULL;
171 }
57183d11
LP
172}
173
174int seccomp_arch_from_string(const char *n, uint32_t *ret) {
175 if (!n)
176 return -EINVAL;
177
178 assert(ret);
179
180 if (streq(n, "native"))
181 *ret = SCMP_ARCH_NATIVE;
182 else if (streq(n, "x86"))
183 *ret = SCMP_ARCH_X86;
184 else if (streq(n, "x86-64"))
185 *ret = SCMP_ARCH_X86_64;
186 else if (streq(n, "x32"))
187 *ret = SCMP_ARCH_X32;
188 else if (streq(n, "arm"))
189 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
190 else if (streq(n, "arm64"))
191 *ret = SCMP_ARCH_AARCH64;
f9d3fb6b
XW
192#ifdef SCMP_ARCH_LOONGARCH64
193 else if (streq(n, "loongarch64"))
194 *ret = SCMP_ARCH_LOONGARCH64;
195#endif
aa34055f
ZJS
196 else if (streq(n, "mips"))
197 *ret = SCMP_ARCH_MIPS;
198 else if (streq(n, "mips64"))
199 *ret = SCMP_ARCH_MIPS64;
200 else if (streq(n, "mips64-n32"))
201 *ret = SCMP_ARCH_MIPS64N32;
202 else if (streq(n, "mips-le"))
203 *ret = SCMP_ARCH_MIPSEL;
204 else if (streq(n, "mips64-le"))
205 *ret = SCMP_ARCH_MIPSEL64;
206 else if (streq(n, "mips64-le-n32"))
207 *ret = SCMP_ARCH_MIPSEL64N32;
344e6b62
SJ
208#ifdef SCMP_ARCH_PARISC
209 else if (streq(n, "parisc"))
210 *ret = SCMP_ARCH_PARISC;
211#endif
212#ifdef SCMP_ARCH_PARISC64
213 else if (streq(n, "parisc64"))
214 *ret = SCMP_ARCH_PARISC64;
215#endif
aa34055f
ZJS
216 else if (streq(n, "ppc"))
217 *ret = SCMP_ARCH_PPC;
218 else if (streq(n, "ppc64"))
219 *ret = SCMP_ARCH_PPC64;
220 else if (streq(n, "ppc64-le"))
221 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
222#ifdef SCMP_ARCH_RISCV64
223 else if (streq(n, "riscv64"))
224 *ret = SCMP_ARCH_RISCV64;
225#endif
6abfd303
HB
226 else if (streq(n, "s390"))
227 *ret = SCMP_ARCH_S390;
228 else if (streq(n, "s390x"))
229 *ret = SCMP_ARCH_S390X;
57183d11
LP
230 else
231 return -EINVAL;
232
233 return 0;
234}
e9642be2 235
469830d1 236int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 237 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
238 int r;
239
469830d1
LP
240 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
241 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
242
243 seccomp = seccomp_init(default_action);
244 if (!seccomp)
245 return -ENOMEM;
246
469830d1
LP
247 if (arch != SCMP_ARCH_NATIVE &&
248 arch != seccomp_arch_native()) {
249
1b52793d 250 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 251 if (r < 0)
b4eaa6cc 252 return r;
469830d1 253
1b52793d 254 r = seccomp_arch_add(seccomp, arch);
469830d1 255 if (r < 0)
b4eaa6cc 256 return r;
469830d1
LP
257
258 assert(seccomp_arch_exist(seccomp, arch) >= 0);
259 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
260 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
261 } else {
262 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
263 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
264 }
265
266 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 267 if (r < 0)
b4eaa6cc 268 return r;
8d7b0c8f
LP
269
270 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
271 if (r < 0)
b4eaa6cc 272 return r;
8d7b0c8f 273
44aaddad
SD
274#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
275 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
276 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
277 if (r < 0)
278 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
279 }
280#endif
281
b4eaa6cc 282 *ret = TAKE_PTR(seccomp);
8d7b0c8f 283 return 0;
8d7b0c8f
LP
284}
285
/* Probes for seccomp support by querying the calling thread's seccomp mode; any non-negative answer
 * from prctl() counts as "available". */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
289
/* Probes for seccomp filter mode by trying to install a NULL filter. The call is expected to fail;
 * filter mode is reported as available only if it fails with EFAULT specifically. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
294
/* Reports whether seccomp filtering can be used. The answer is computed on the first call and cached
 * in a function-local static afterwards.
 *
 * $SYSTEMD_SECCOMP set to a false value turns seccomp off unconditionally. If the variable is unset,
 * true, or unparsable, the result depends on the two kernel probes. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;
        int b;

        if (cached_enabled >= 0)
                return cached_enabled;

        b = secure_getenv_bool("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly disabled by the environment. */
                cached_enabled = false;
                return cached_enabled;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring: %m");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
315
8130926d 316const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 317 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 318 .name = "@default",
d5efc18b 319 .help = "System calls that are always permitted",
40eb6a80 320 .value =
5f02870a 321 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
5abede32 322 "brk\0"
8e24b1d2 323 "cacheflush\0"
40eb6a80 324 "clock_getres\0"
6ca67710 325 "clock_getres_time64\0"
40eb6a80 326 "clock_gettime\0"
6ca67710 327 "clock_gettime64\0"
40eb6a80 328 "clock_nanosleep\0"
6ca67710 329 "clock_nanosleep_time64\0"
40eb6a80
ZJS
330 "execve\0"
331 "exit\0"
332 "exit_group\0"
e41b0f42 333 "futex\0"
6ca67710 334 "futex_time64\0"
76e86b8d 335 "futex_waitv\0"
e41b0f42
LP
336 "get_robust_list\0"
337 "get_thread_area\0"
09d3020b
DH
338 "getegid\0"
339 "getegid32\0"
340 "geteuid\0"
341 "geteuid32\0"
342 "getgid\0"
343 "getgid32\0"
344 "getgroups\0"
345 "getgroups32\0"
346 "getpgid\0"
347 "getpgrp\0"
348 "getpid\0"
349 "getppid\0"
14f4b1b5 350 "getrandom\0"
09d3020b
DH
351 "getresgid\0"
352 "getresgid32\0"
353 "getresuid\0"
354 "getresuid32\0"
40eb6a80 355 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
356 "getsid\0"
357 "gettid\0"
40eb6a80 358 "gettimeofday\0"
09d3020b
DH
359 "getuid\0"
360 "getuid32\0"
e41b0f42 361 "membarrier\0"
5abede32
LP
362 "mmap\0"
363 "mmap2\0"
47286254 364 "mprotect\0"
11b9105d 365 "munmap\0"
40eb6a80
ZJS
366 "nanosleep\0"
367 "pause\0"
4c3a9176 368 "prlimit64\0"
e41b0f42 369 "restart_syscall\0"
09925036 370 "riscv_flush_icache\0"
ca15fc48 371 "riscv_hwprobe\0"
6fee3be0 372 "rseq\0"
40eb6a80 373 "rt_sigreturn\0"
7df660e4 374 "sched_getaffinity\0"
8f44de08 375 "sched_yield\0"
e41b0f42
LP
376 "set_robust_list\0"
377 "set_thread_area\0"
378 "set_tid_address\0"
ce5faeac 379 "set_tls\0"
40eb6a80
ZJS
380 "sigreturn\0"
381 "time\0"
4c3a9176 382 "ugetrlimit\0"
40eb6a80 383 },
44898c53
LP
384 [SYSCALL_FILTER_SET_AIO] = {
385 .name = "@aio",
386 .help = "Asynchronous IO",
387 .value =
388 "io_cancel\0"
389 "io_destroy\0"
390 "io_getevents\0"
a05cfe23 391 "io_pgetevents\0"
6ca67710 392 "io_pgetevents_time64\0"
44898c53
LP
393 "io_setup\0"
394 "io_submit\0"
9e486265
LP
395 "io_uring_enter\0"
396 "io_uring_register\0"
397 "io_uring_setup\0"
44898c53 398 },
133ddbbe 399 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 400 .name = "@basic-io",
d5efc18b 401 .help = "Basic IO",
133ddbbe 402 .value =
648a0ed0 403 "_llseek\0"
133ddbbe 404 "close\0"
6ea0d25c 405 "close_range\0"
648a0ed0 406 "dup\0"
133ddbbe
LP
407 "dup2\0"
408 "dup3\0"
133ddbbe
LP
409 "lseek\0"
410 "pread64\0"
411 "preadv\0"
44898c53 412 "preadv2\0"
133ddbbe
LP
413 "pwrite64\0"
414 "pwritev\0"
44898c53 415 "pwritev2\0"
133ddbbe
LP
416 "read\0"
417 "readv\0"
418 "write\0"
419 "writev\0"
420 },
44898c53
LP
421 [SYSCALL_FILTER_SET_CHOWN] = {
422 .name = "@chown",
423 .help = "Change ownership of files and directories",
424 .value =
425 "chown\0"
426 "chown32\0"
427 "fchown\0"
428 "fchown32\0"
429 "fchownat\0"
430 "lchown\0"
431 "lchown32\0"
432 },
8130926d 433 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 434 .name = "@clock",
d5efc18b 435 .help = "Change the system time",
201c1cc2
TM
436 .value =
437 "adjtimex\0"
1f9ac68b 438 "clock_adjtime\0"
6ca67710 439 "clock_adjtime64\0"
1f9ac68b 440 "clock_settime\0"
6ca67710 441 "clock_settime64\0"
201c1cc2 442 "settimeofday\0"
8130926d
LP
443 },
444 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 445 .name = "@cpu-emulation",
d5efc18b 446 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
447 .value =
448 "modify_ldt\0"
449 "subpage_prot\0"
450 "switch_endian\0"
451 "vm86\0"
452 "vm86old\0"
8130926d
LP
453 },
454 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 455 .name = "@debug",
d5efc18b 456 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
457 .value =
458 "lookup_dcookie\0"
459 "perf_event_open\0"
8270e3d8 460 "pidfd_getfd\0"
1f9ac68b
LP
461 "ptrace\0"
462 "rtas\0"
463 "s390_runtime_instr\0"
464 "sys_debug_setcontext\0"
8130926d 465 },
1a1b13c9
LP
466 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
467 .name = "@file-system",
468 .help = "File system operations",
469 .value =
470 "access\0"
471 "chdir\0"
472 "chmod\0"
473 "close\0"
474 "creat\0"
475 "faccessat\0"
bcf08acb 476 "faccessat2\0"
1a1b13c9
LP
477 "fallocate\0"
478 "fchdir\0"
479 "fchmod\0"
480 "fchmodat\0"
6e10405a 481 "fchmodat2\0"
1a1b13c9 482 "fcntl\0"
ceaa6aa7 483 "fcntl64\0"
1a1b13c9
LP
484 "fgetxattr\0"
485 "flistxattr\0"
ceaa6aa7 486 "fremovexattr\0"
1a1b13c9 487 "fsetxattr\0"
1a1b13c9 488 "fstat\0"
ceaa6aa7 489 "fstat64\0"
1a1b13c9 490 "fstatat64\0"
1a1b13c9 491 "fstatfs\0"
ceaa6aa7 492 "fstatfs64\0"
1a1b13c9 493 "ftruncate\0"
ceaa6aa7 494 "ftruncate64\0"
1a1b13c9
LP
495 "futimesat\0"
496 "getcwd\0"
1a1b13c9 497 "getdents\0"
ceaa6aa7 498 "getdents64\0"
1a1b13c9
LP
499 "getxattr\0"
500 "inotify_add_watch\0"
ceaa6aa7 501 "inotify_init\0"
1a1b13c9
LP
502 "inotify_init1\0"
503 "inotify_rm_watch\0"
504 "lgetxattr\0"
505 "link\0"
506 "linkat\0"
507 "listxattr\0"
508 "llistxattr\0"
509 "lremovexattr\0"
510 "lsetxattr\0"
1a1b13c9 511 "lstat\0"
ceaa6aa7 512 "lstat64\0"
1a1b13c9
LP
513 "mkdir\0"
514 "mkdirat\0"
515 "mknod\0"
516 "mknodat\0"
1a1b13c9 517 "newfstatat\0"
ceaa6aa7
LP
518 "oldfstat\0"
519 "oldlstat\0"
520 "oldstat\0"
1a1b13c9
LP
521 "open\0"
522 "openat\0"
8270e3d8 523 "openat2\0"
1a1b13c9
LP
524 "readlink\0"
525 "readlinkat\0"
526 "removexattr\0"
527 "rename\0"
1a1b13c9 528 "renameat\0"
ceaa6aa7 529 "renameat2\0"
1a1b13c9
LP
530 "rmdir\0"
531 "setxattr\0"
1a1b13c9 532 "stat\0"
ceaa6aa7 533 "stat64\0"
1a1b13c9 534 "statfs\0"
ceaa6aa7 535 "statfs64\0"
a4135a74 536 "statx\0"
1a1b13c9
LP
537 "symlink\0"
538 "symlinkat\0"
1a1b13c9 539 "truncate\0"
ceaa6aa7 540 "truncate64\0"
1a1b13c9
LP
541 "unlink\0"
542 "unlinkat\0"
ceaa6aa7 543 "utime\0"
1a1b13c9 544 "utimensat\0"
6ca67710 545 "utimensat_time64\0"
1a1b13c9
LP
546 "utimes\0"
547 },
8130926d 548 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 549 .name = "@io-event",
d5efc18b 550 .help = "Event loop system calls",
201c1cc2
TM
551 .value =
552 "_newselect\0"
201c1cc2 553 "epoll_create\0"
215728ff 554 "epoll_create1\0"
201c1cc2
TM
555 "epoll_ctl\0"
556 "epoll_ctl_old\0"
557 "epoll_pwait\0"
34254e59 558 "epoll_pwait2\0"
201c1cc2
TM
559 "epoll_wait\0"
560 "epoll_wait_old\0"
201c1cc2 561 "eventfd\0"
215728ff 562 "eventfd2\0"
201c1cc2
TM
563 "poll\0"
564 "ppoll\0"
6ca67710 565 "ppoll_time64\0"
201c1cc2 566 "pselect6\0"
6ca67710 567 "pselect6_time64\0"
201c1cc2 568 "select\0"
8130926d
LP
569 },
570 [SYSCALL_FILTER_SET_IPC] = {
8130926d 571 .name = "@ipc",
d5efc18b
ZJS
572 .help = "SysV IPC, POSIX Message Queues or other IPC",
573 .value =
574 "ipc\0"
cd5bfd7e 575 "memfd_create\0"
201c1cc2
TM
576 "mq_getsetattr\0"
577 "mq_notify\0"
578 "mq_open\0"
579 "mq_timedreceive\0"
6ca67710 580 "mq_timedreceive_time64\0"
201c1cc2 581 "mq_timedsend\0"
6ca67710 582 "mq_timedsend_time64\0"
201c1cc2
TM
583 "mq_unlink\0"
584 "msgctl\0"
585 "msgget\0"
586 "msgrcv\0"
587 "msgsnd\0"
cd5bfd7e 588 "pipe\0"
215728ff 589 "pipe2\0"
34254e59 590 "process_madvise\0"
201c1cc2
TM
591 "process_vm_readv\0"
592 "process_vm_writev\0"
593 "semctl\0"
594 "semget\0"
595 "semop\0"
596 "semtimedop\0"
6ca67710 597 "semtimedop_time64\0"
201c1cc2
TM
598 "shmat\0"
599 "shmctl\0"
600 "shmdt\0"
601 "shmget\0"
8130926d
LP
602 },
603 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 604 .name = "@keyring",
d5efc18b 605 .help = "Kernel keyring access",
1f9ac68b
LP
606 .value =
607 "add_key\0"
608 "keyctl\0"
609 "request_key\0"
8130926d 610 },
cd0ddf6f
LP
611 [SYSCALL_FILTER_SET_MEMLOCK] = {
612 .name = "@memlock",
613 .help = "Memory locking control",
614 .value =
615 "mlock\0"
616 "mlock2\0"
617 "mlockall\0"
618 "munlock\0"
619 "munlockall\0"
620 },
8130926d 621 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 622 .name = "@module",
d5efc18b 623 .help = "Loading and unloading of kernel modules",
201c1cc2 624 .value =
201c1cc2
TM
625 "delete_module\0"
626 "finit_module\0"
627 "init_module\0"
8130926d
LP
628 },
629 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 630 .name = "@mount",
d5efc18b 631 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
632 .value =
633 "chroot\0"
9e486265
LP
634 "fsconfig\0"
635 "fsmount\0"
636 "fsopen\0"
637 "fspick\0"
201c1cc2 638 "mount\0"
34254e59 639 "mount_setattr\0"
9e486265
LP
640 "move_mount\0"
641 "open_tree\0"
201c1cc2 642 "pivot_root\0"
201c1cc2 643 "umount\0"
215728ff 644 "umount2\0"
8130926d
LP
645 },
646 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 647 .name = "@network-io",
d5efc18b 648 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 649 .value =
201c1cc2 650 "accept\0"
215728ff 651 "accept4\0"
201c1cc2
TM
652 "bind\0"
653 "connect\0"
654 "getpeername\0"
655 "getsockname\0"
656 "getsockopt\0"
657 "listen\0"
658 "recv\0"
659 "recvfrom\0"
660 "recvmmsg\0"
6ca67710 661 "recvmmsg_time64\0"
201c1cc2
TM
662 "recvmsg\0"
663 "send\0"
664 "sendmmsg\0"
665 "sendmsg\0"
666 "sendto\0"
667 "setsockopt\0"
668 "shutdown\0"
669 "socket\0"
670 "socketcall\0"
671 "socketpair\0"
8130926d
LP
672 },
673 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 674 /* some unknown even to libseccomp */
8130926d 675 .name = "@obsolete",
d5efc18b 676 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
677 .value =
678 "_sysctl\0"
679 "afs_syscall\0"
802fa07a 680 "bdflush\0"
201c1cc2 681 "break\0"
1f9ac68b 682 "create_module\0"
201c1cc2
TM
683 "ftime\0"
684 "get_kernel_syms\0"
201c1cc2
TM
685 "getpmsg\0"
686 "gtty\0"
7e0c3b8f 687 "idle\0"
201c1cc2 688 "lock\0"
201c1cc2 689 "mpx\0"
201c1cc2
TM
690 "prof\0"
691 "profil\0"
201c1cc2
TM
692 "putpmsg\0"
693 "query_module\0"
201c1cc2
TM
694 "security\0"
695 "sgetmask\0"
696 "ssetmask\0"
ae5e9bf4 697 "stime\0"
201c1cc2 698 "stty\0"
1f9ac68b 699 "sysfs\0"
201c1cc2
TM
700 "tuxcall\0"
701 "ulimit\0"
702 "uselib\0"
1f9ac68b 703 "ustat\0"
201c1cc2 704 "vserver\0"
8130926d 705 },
9493b168
ZJS
706 [SYSCALL_FILTER_SET_PKEY] = {
707 .name = "@pkey",
708 .help = "System calls used for memory protection keys",
709 .value =
710 "pkey_alloc\0"
711 "pkey_free\0"
712 "pkey_mprotect\0"
713 },
8130926d 714 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 715 .name = "@privileged",
d5efc18b 716 .help = "All system calls which need super-user capabilities",
201c1cc2 717 .value =
44898c53 718 "@chown\0"
201c1cc2
TM
719 "@clock\0"
720 "@module\0"
721 "@raw-io\0"
af0f047b
LP
722 "@reboot\0"
723 "@swap\0"
215728ff 724 "_sysctl\0"
201c1cc2 725 "acct\0"
201c1cc2 726 "bpf\0"
1f9ac68b 727 "capset\0"
201c1cc2 728 "chroot\0"
a05cfe23 729 "fanotify_init\0"
9e486265 730 "fanotify_mark\0"
201c1cc2 731 "nfsservctl\0"
a05cfe23 732 "open_by_handle_at\0"
201c1cc2
TM
733 "pivot_root\0"
734 "quotactl\0"
76e86b8d 735 "quotactl_fd\0"
201c1cc2 736 "setdomainname\0"
201c1cc2 737 "setfsuid\0"
215728ff 738 "setfsuid32\0"
201c1cc2 739 "setgroups\0"
215728ff 740 "setgroups32\0"
201c1cc2 741 "sethostname\0"
201c1cc2 742 "setresuid\0"
215728ff 743 "setresuid32\0"
201c1cc2 744 "setreuid\0"
215728ff 745 "setreuid32\0"
e05ee49b 746 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 747 "setuid32\0"
201c1cc2 748 "vhangup\0"
8130926d
LP
749 },
750 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 751 .name = "@process",
7b121df6 752 .help = "Process control, execution, namespacing operations",
201c1cc2 753 .value =
09d3020b 754 "capget\0" /* Able to query arbitrary processes */
201c1cc2 755 "clone\0"
c5503601
ZJS
756 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
757 * implement seccomp, so we don't need to list it at all. C.f.
758 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
9e486265 759 "clone3\0"
201c1cc2
TM
760 "execveat\0"
761 "fork\0"
b887d2eb 762 "getrusage\0"
201c1cc2 763 "kill\0"
9e486265 764 "pidfd_open\0"
46fcf95d 765 "pidfd_send_signal\0"
201c1cc2 766 "prctl\0"
b887d2eb
LP
767 "rt_sigqueueinfo\0"
768 "rt_tgsigqueueinfo\0"
201c1cc2 769 "setns\0"
a9518dc3 770 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 771 "tgkill\0"
b887d2eb 772 "times\0"
201c1cc2
TM
773 "tkill\0"
774 "unshare\0"
775 "vfork\0"
b887d2eb
LP
776 "wait4\0"
777 "waitid\0"
778 "waitpid\0"
8130926d
LP
779 },
780 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 781 .name = "@raw-io",
d5efc18b 782 .help = "Raw I/O port access",
201c1cc2
TM
783 .value =
784 "ioperm\0"
785 "iopl\0"
1f9ac68b 786 "pciconfig_iobase\0"
201c1cc2
TM
787 "pciconfig_read\0"
788 "pciconfig_write\0"
789 "s390_pci_mmio_read\0"
790 "s390_pci_mmio_write\0"
8130926d 791 },
bd2ab3f4
LP
792 [SYSCALL_FILTER_SET_REBOOT] = {
793 .name = "@reboot",
794 .help = "Reboot and reboot preparation/kexec",
795 .value =
bd2ab3f4 796 "kexec_file_load\0"
e59608fa 797 "kexec_load\0"
bd2ab3f4
LP
798 "reboot\0"
799 },
133ddbbe 800 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 801 .name = "@resources",
58a8f68b 802 .help = "Alter resource settings",
133ddbbe 803 .value =
0963c053
LP
804 "ioprio_set\0"
805 "mbind\0"
806 "migrate_pages\0"
807 "move_pages\0"
808 "nice\0"
0963c053
LP
809 "sched_setaffinity\0"
810 "sched_setattr\0"
133ddbbe
LP
811 "sched_setparam\0"
812 "sched_setscheduler\0"
0963c053 813 "set_mempolicy\0"
76e86b8d 814 "set_mempolicy_home_node\0"
133ddbbe
LP
815 "setpriority\0"
816 "setrlimit\0"
133ddbbe 817 },
d12632a8
LP
818 [SYSCALL_FILTER_SET_SANDBOX] = {
819 .name = "@sandbox",
820 .help = "Sandbox functionality",
821 .value =
822 "landlock_add_rule\0"
823 "landlock_create_ruleset\0"
824 "landlock_restrict_self\0"
825 "seccomp\0"
826 },
6eaaeee9
LP
827 [SYSCALL_FILTER_SET_SETUID] = {
828 .name = "@setuid",
829 .help = "Operations for changing user/group credentials",
830 .value =
6eaaeee9 831 "setgid\0"
215728ff 832 "setgid32\0"
6eaaeee9 833 "setgroups\0"
215728ff 834 "setgroups32\0"
6eaaeee9 835 "setregid\0"
215728ff 836 "setregid32\0"
6eaaeee9 837 "setresgid\0"
215728ff 838 "setresgid32\0"
6eaaeee9 839 "setresuid\0"
215728ff 840 "setresuid32\0"
6eaaeee9 841 "setreuid\0"
215728ff 842 "setreuid32\0"
6eaaeee9 843 "setuid\0"
215728ff 844 "setuid32\0"
6eaaeee9 845 },
cd0ddf6f
LP
846 [SYSCALL_FILTER_SET_SIGNAL] = {
847 .name = "@signal",
848 .help = "Process signal handling",
849 .value =
850 "rt_sigaction\0"
851 "rt_sigpending\0"
852 "rt_sigprocmask\0"
853 "rt_sigsuspend\0"
854 "rt_sigtimedwait\0"
6ca67710 855 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
856 "sigaction\0"
857 "sigaltstack\0"
858 "signal\0"
859 "signalfd\0"
860 "signalfd4\0"
861 "sigpending\0"
862 "sigprocmask\0"
863 "sigsuspend\0"
864 },
bd2ab3f4
LP
865 [SYSCALL_FILTER_SET_SWAP] = {
866 .name = "@swap",
867 .help = "Enable/disable swap devices",
868 .value =
869 "swapoff\0"
870 "swapon\0"
871 },
44898c53
LP
872 [SYSCALL_FILTER_SET_SYNC] = {
873 .name = "@sync",
874 .help = "Synchronize files and memory to storage",
875 .value =
876 "fdatasync\0"
877 "fsync\0"
878 "msync\0"
879 "sync\0"
880 "sync_file_range\0"
a8fb09f5 881 "sync_file_range2\0"
44898c53
LP
882 "syncfs\0"
883 },
70526841
LP
884 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
885 .name = "@system-service",
886 .help = "General system service operations",
887 .value =
888 "@aio\0"
889 "@basic-io\0"
890 "@chown\0"
891 "@default\0"
892 "@file-system\0"
893 "@io-event\0"
894 "@ipc\0"
895 "@keyring\0"
896 "@memlock\0"
897 "@network-io\0"
898 "@process\0"
899 "@resources\0"
900 "@setuid\0"
901 "@signal\0"
902 "@sync\0"
903 "@timer\0"
26b682e8 904 "arm_fadvise64_64\0"
70526841
LP
905 "capget\0"
906 "capset\0"
907 "copy_file_range\0"
908 "fadvise64\0"
909 "fadvise64_64\0"
910 "flock\0"
911 "get_mempolicy\0"
912 "getcpu\0"
913 "getpriority\0"
70526841
LP
914 "ioctl\0"
915 "ioprio_get\0"
916 "kcmp\0"
917 "madvise\0"
70526841
LP
918 "mremap\0"
919 "name_to_handle_at\0"
920 "oldolduname\0"
921 "olduname\0"
922 "personality\0"
923 "readahead\0"
924 "readdir\0"
925 "remap_file_pages\0"
926 "sched_get_priority_max\0"
927 "sched_get_priority_min\0"
70526841
LP
928 "sched_getattr\0"
929 "sched_getparam\0"
930 "sched_getscheduler\0"
931 "sched_rr_get_interval\0"
6ca67710 932 "sched_rr_get_interval_time64\0"
70526841
LP
933 "sched_yield\0"
934 "sendfile\0"
935 "sendfile64\0"
936 "setfsgid\0"
937 "setfsgid32\0"
938 "setfsuid\0"
939 "setfsuid32\0"
940 "setpgid\0"
941 "setsid\0"
942 "splice\0"
943 "sysinfo\0"
944 "tee\0"
945 "umask\0"
946 "uname\0"
947 "userfaultfd\0"
948 "vmsplice\0"
949 },
cd0ddf6f
LP
950 [SYSCALL_FILTER_SET_TIMER] = {
951 .name = "@timer",
952 .help = "Schedule operations by time",
953 .value =
954 "alarm\0"
955 "getitimer\0"
956 "setitimer\0"
957 "timer_create\0"
958 "timer_delete\0"
959 "timer_getoverrun\0"
960 "timer_gettime\0"
6ca67710 961 "timer_gettime64\0"
cd0ddf6f 962 "timer_settime\0"
6ca67710 963 "timer_settime64\0"
cd0ddf6f
LP
964 "timerfd_create\0"
965 "timerfd_gettime\0"
6ca67710 966 "timerfd_gettime64\0"
cd0ddf6f 967 "timerfd_settime\0"
6ca67710 968 "timerfd_settime64\0"
cd0ddf6f
LP
969 "times\0"
970 },
95aac012
ZJS
971 [SYSCALL_FILTER_SET_KNOWN] = {
972 .name = "@known",
973 .help = "All known syscalls declared in the kernel",
974 .value =
6d6a0854 975 "@obsolete\0"
95aac012
ZJS
976#include "syscall-list.h"
977 },
201c1cc2 978};
8130926d
LP
979
980const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
981 if (isempty(name) || name[0] != '@')
982 return NULL;
983
077e8fc0 984 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
985 if (streq(syscall_filter_sets[i].name, name))
986 return syscall_filter_sets + i;
987
988 return NULL;
989}
990
000c0520
ZJS
991static int add_syscall_filter_set(
992 scmp_filter_ctx seccomp,
993 const SyscallFilterSet *set,
994 uint32_t action,
995 char **exclude,
996 bool log_missing,
997 char ***added);
998
999int seccomp_add_syscall_filter_item(
1000 scmp_filter_ctx *seccomp,
1001 const char *name,
1002 uint32_t action,
1003 char **exclude,
1004 bool log_missing,
1005 char ***added) {
69b1b241
LP
1006
1007 assert(seccomp);
1008 assert(name);
1009
960e4569
LP
1010 if (strv_contains(exclude, name))
1011 return 0;
1012
000c0520
ZJS
1013 /* Any syscalls that are handled are added to the *added strv. The pointer
1014 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1015
69b1b241
LP
1016 if (name[0] == '@') {
1017 const SyscallFilterSet *other;
1018
1019 other = syscall_filter_set_find(name);
baaa35ad
ZJS
1020 if (!other)
1021 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
1022 "Filter set %s is not known!",
1023 name);
69b1b241 1024
000c0520 1025 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 1026
69b1b241 1027 } else {
b54f36c6 1028 int id, r;
69b1b241
LP
1029
1030 id = seccomp_syscall_resolve_name(name);
cff7bff8 1031 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
1032 if (log_missing)
1033 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 1034 return 0;
cff7bff8 1035 }
69b1b241
LP
1036
1037 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 1038 if (r < 0) {
69b1b241 1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
1040 bool ignore = r == -EDOM;
1041
1042 if (!ignore || log_missing)
1043 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1044 name, id, ignore ? ", ignoring" : "");
1045 if (!ignore)
1046 return r;
b54f36c6 1047 }
69b1b241 1048
000c0520
ZJS
1049 if (added) {
1050 r = strv_extend(added, name);
1051 if (r < 0)
1052 return r;
1053 }
1054
b54f36c6
ZJS
1055 return 0;
1056 }
69b1b241
LP
1057}
1058
000c0520 1059static int add_syscall_filter_set(
469830d1 1060 scmp_filter_ctx seccomp,
469830d1 1061 const SyscallFilterSet *set,
960e4569 1062 uint32_t action,
b54f36c6 1063 char **exclude,
000c0520
ZJS
1064 bool log_missing,
1065 char ***added) {
469830d1 1066
8130926d
LP
1067 int r;
1068
000c0520
ZJS
1069 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1070
8130926d
LP
1071 assert(seccomp);
1072 assert(set);
1073
1074 NULSTR_FOREACH(sys, set->value) {
000c0520 1075 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1076 if (r < 0)
1077 return r;
469830d1
LP
1078 }
1079
1080 return 0;
1081}
1082
03c0730f
YW
1083static uint32_t override_default_action(uint32_t default_action) {
1084 /* When the requested filter is an allow-list, and the default action is something critical, we
1085 * install ENOSYS as the default action, but it will only apply to syscalls which are not in the
1086 * @known set. */
1087
1088 if (default_action == SCMP_ACT_ALLOW)
1089 return default_action;
1090
1091#ifdef SCMP_ACT_LOG
1092 if (default_action == SCMP_ACT_LOG)
1093 return default_action;
1094#endif
1095
1096 return SCMP_ACT_ERRNO(ENOSYS);
1097}
1098
b54f36c6 1099int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
03c0730f 1100 uint32_t arch, default_action_override;
469830d1
LP
1101 int r;
1102
1103 assert(set);
1104
1105 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1106 * each local arch. */
469830d1 1107
78b2ad7d
YW
1108 default_action_override = override_default_action(default_action);
1109
469830d1
LP
1110 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1111 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78b2ad7d 1112 _cleanup_strv_free_ char **added = NULL;
469830d1 1113
30868c1c 1114 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1115
78b2ad7d 1116 r = seccomp_init_for_arch(&seccomp, arch, default_action_override);
8130926d
LP
1117 if (r < 0)
1118 return r;
469830d1 1119
78b2ad7d 1120 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, &added);
7e86bd73
ZJS
1121 if (r < 0)
1122 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1 1123
78b2ad7d
YW
1124 if (default_action != default_action_override)
1125 NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
1126 int id;
1127
1128 id = seccomp_syscall_resolve_name(name);
1129 if (id < 0)
1130 continue;
1131
1132 /* Ignore the syscall if it was already handled above */
1133 if (strv_contains(added, name))
1134 continue;
1135
1136 r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
1137 if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */
1138 return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
1139 name, id);
1140 }
1141
1142#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1143 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1144 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2);
1145 if (r < 0)
1146 log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1147#endif
1148
469830d1 1149 r = seccomp_load(seccomp);
3c098014
ZJS
1150 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1151 return r;
1152 if (r < 0)
1153 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m",
1154 seccomp_arch_to_string(arch));
8130926d
LP
1155 }
1156
1157 return 0;
1158}
a3be2849 1159
1862b310 1160int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
03c0730f 1161 uint32_t arch, default_action_override;
a3be2849
LP
1162 int r;
1163
1862b310
YW
1164 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1165 * of a SyscallFilterSet* table. */
a3be2849 1166
1862b310 1167 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
469830d1 1168 return 0;
a3be2849 1169
03c0730f
YW
1170 default_action_override = override_default_action(default_action);
1171
469830d1
LP
1172 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1173 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1174 void *syscall_id, *val;
a3be2849 1175
30868c1c 1176 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1177
03c0730f 1178 r = seccomp_init_for_arch(&seccomp, arch, default_action_override);
469830d1
LP
1179 if (r < 0)
1180 return r;
a3be2849 1181
1862b310 1182 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
8cfa775f 1183 uint32_t a = action;
b54f36c6
ZJS
1184 int id = PTR_TO_INT(syscall_id) - 1;
1185 int error = PTR_TO_INT(val);
8cfa775f 1186
005bfaf1
TM
1187 if (error == SECCOMP_ERROR_NUMBER_KILL)
1188 a = scmp_act_kill_process();
9df2cdd8
TM
1189#ifdef SCMP_ACT_LOG
1190 else if (action == SCMP_ACT_LOG)
1191 a = SCMP_ACT_LOG;
1192#endif
68acc1af 1193 else if (error >= 0)
b54f36c6 1194 a = SCMP_ACT_ERRNO(error);
8cfa775f 1195
b54f36c6 1196 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1 1197 if (r < 0) {
1862b310
YW
1198 /* If the system call is not known on this architecture, then that's
1199 * fine, let's ignore it */
469830d1 1200 _cleanup_free_ char *n = NULL;
7e86bd73 1201 bool ignore;
469830d1 1202
b54f36c6 1203 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1204 ignore = r == -EDOM;
1205 if (!ignore || log_missing)
1206 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1207 strna(n), id, ignore ? ", ignoring" : "");
1208 if (!ignore)
1209 return r;
469830d1
LP
1210 }
1211 }
1212
03c0730f
YW
1213 if (default_action != default_action_override)
1214 NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
1215 int id;
2331c02d 1216
03c0730f
YW
1217 id = seccomp_syscall_resolve_name(name);
1218 if (id < 0)
1219 continue;
2331c02d 1220
03c0730f
YW
1221 /* Ignore the syscall if it was already handled above */
1222 if (hashmap_contains(filter, INT_TO_PTR(id + 1)))
1223 continue;
2331c02d 1224
03c0730f
YW
1225 r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
1226 if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */
1227 return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
1228 name, id);
1229 }
2331c02d 1230
e6c5386d
ZJS
1231#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1232 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1233 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2);
1234 if (r < 0)
1235 log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1236#endif
1237
469830d1 1238 r = seccomp_load(seccomp);
3c098014
ZJS
1239 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1240 return r;
1241 if (r < 0)
a52765a5 1242 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1862b310 1243 seccomp_arch_to_string(arch));
469830d1
LP
1244 }
1245
1246 return 0;
add00535
LP
1247}
1248
58f6ab44 1249int seccomp_parse_syscall_filter(
898748d8
YW
1250 const char *name,
1251 int errno_num,
1252 Hashmap *filter,
13d92c63 1253 SeccompParseFlags flags,
898748d8
YW
1254 const char *unit,
1255 const char *filename,
1256 unsigned line) {
1257
1258 int r;
1259
1260 assert(name);
1261 assert(filter);
1262
084a46d7
YW
1263 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1264 return -EINVAL;
1265
898748d8
YW
1266 if (name[0] == '@') {
1267 const SyscallFilterSet *set;
898748d8
YW
1268
1269 set = syscall_filter_set_find(name);
1270 if (!set) {
9e29ee40 1271 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1272 return -EINVAL;
13d92c63 1273
9e29ee40 1274 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1275 "Unknown system call group, ignoring: %s", name);
1276 return 0;
898748d8
YW
1277 }
1278
1279 NULSTR_FOREACH(i, set->value) {
3c098014
ZJS
1280 /* Call ourselves again, for the group to parse. Note that we downgrade logging here
1281 * (i.e. take away the SECCOMP_PARSE_LOG flag) since any issues in the group table
1282 * are our own problem, not a problem in user configuration data and we shouldn't
1283 * pretend otherwise by complaining about them. */
58f6ab44 1284 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1285 if (r < 0)
1286 return r;
1287 }
1288 } else {
1289 int id;
1290
1291 id = seccomp_syscall_resolve_name(name);
1292 if (id == __NR_SCMP_ERROR) {
9e29ee40 1293 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1294 return -EINVAL;
13d92c63 1295
9e29ee40 1296 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
309a4212 1297 "System call %s is not known, ignoring.", name);
13d92c63 1298 return 0;
898748d8
YW
1299 }
1300
3c098014
ZJS
1301 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it
1302 * from the list. The entries in allow-list with non-negative error value will be handled
1303 * with SCMP_ACT_ERRNO() instead of the default action. */
68acc1af
YW
1304 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1305 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
898748d8
YW
1306 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1307 if (r < 0)
851ee70a
LW
1308 switch (r) {
1309 case -ENOMEM:
9e29ee40 1310 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
851ee70a 1311 case -EEXIST:
9d7fe7c6
LW
1312 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1313 break;
851ee70a
LW
1314 default:
1315 return r;
1316 }
898748d8
YW
1317 } else
1318 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1319 }
1320
1321 return 0;
1322}
1323
add00535 1324int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1325 uint32_t arch;
add00535
LP
1326 int r;
1327
f1d34068 1328 if (DEBUG_LOGGING) {
add00535
LP
1329 _cleanup_free_ char *s = NULL;
1330
86c2a9f1 1331 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1332 log_debug("Restricting namespace to: %s.", strna(s));
1333 }
1334
1335 /* NOOP? */
d7a0f1f4 1336 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
add00535
LP
1337 return 0;
1338
469830d1
LP
1339 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1340 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1341
30868c1c 1342 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1343
1344 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1345 if (r < 0)
1346 return r;
1347
30193fe8
ZJS
1348 /* We cannot filter on individual flags to clone3(), and we need to disable the
1349 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1350 * users shall fall back to clone(), as if on an older kernel.
1351 *
1352 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1353 * https://github.com/moby/moby/issues/42680. */
1354
1355 r = seccomp_rule_add_exact(
1356 seccomp,
1357 SCMP_ACT_ERRNO(ENOSYS),
1358 SCMP_SYS(clone3),
1359 0);
1360 if (r < 0)
3c098014
ZJS
1361 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m",
1362 seccomp_arch_to_string(arch));
30193fe8 1363
469830d1 1364 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
3c098014
ZJS
1365 /* If every single kind of namespace shall be prohibited, then let's block the whole
1366 * setns() syscall altogether. */
469830d1
LP
1367 r = seccomp_rule_add_exact(
1368 seccomp,
1369 SCMP_ACT_ERRNO(EPERM),
1370 SCMP_SYS(setns),
1371 0);
1372 else
3c098014
ZJS
1373 /* Otherwise, block only the invocations with the appropriate flags in the loop
1374 * below, but also the special invocation with a zero flags argument, right here. */
469830d1
LP
1375 r = seccomp_rule_add_exact(
1376 seccomp,
1377 SCMP_ACT_ERRNO(EPERM),
1378 SCMP_SYS(setns),
1379 1,
1380 SCMP_A1(SCMP_CMP_EQ, 0));
1381 if (r < 0) {
3c098014
ZJS
1382 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1383 seccomp_arch_to_string(arch));
469830d1
LP
1384 continue;
1385 }
1386
241b1577 1387 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
469830d1
LP
1388 unsigned long f;
1389
241b1577 1390 f = namespace_info[i].clone_flag;
d7a0f1f4 1391 if (FLAGS_SET(retain, f)) {
241b1577 1392 log_debug("Permitting %s.", namespace_info[i].proc_name);
469830d1
LP
1393 continue;
1394 }
1395
30868c1c 1396 log_trace("Blocking %s.", namespace_info[i].proc_name);
469830d1
LP
1397
1398 r = seccomp_rule_add_exact(
1399 seccomp,
1400 SCMP_ACT_ERRNO(EPERM),
1401 SCMP_SYS(unshare),
1402 1,
1403 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1404 if (r < 0) {
3c098014
ZJS
1405 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m",
1406 seccomp_arch_to_string(arch));
469830d1
LP
1407 break;
1408 }
1409
511ceb1f
ZJS
1410 /* On s390/s390x the first two parameters to clone are switched */
1411 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1412 r = seccomp_rule_add_exact(
1413 seccomp,
1414 SCMP_ACT_ERRNO(EPERM),
1415 SCMP_SYS(clone),
1416 1,
1417 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1418 else
1419 r = seccomp_rule_add_exact(
1420 seccomp,
1421 SCMP_ACT_ERRNO(EPERM),
1422 SCMP_SYS(clone),
1423 1,
1424 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1 1425 if (r < 0) {
3c098014
ZJS
1426 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m",
1427 seccomp_arch_to_string(arch));
469830d1
LP
1428 break;
1429 }
1430
1431 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1432 r = seccomp_rule_add_exact(
1433 seccomp,
1434 SCMP_ACT_ERRNO(EPERM),
1435 SCMP_SYS(setns),
1436 1,
1437 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1438 if (r < 0) {
3c098014
ZJS
1439 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1440 seccomp_arch_to_string(arch));
469830d1
LP
1441 break;
1442 }
1443 }
1444 }
1445 if (r < 0)
1446 continue;
1447
1448 r = seccomp_load(seccomp);
3c098014
ZJS
1449 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1450 return r;
1451 if (r < 0)
1452 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m",
1453 seccomp_arch_to_string(arch));
469830d1
LP
1454 }
1455
1456 return 0;
1457}
1458
1459int seccomp_protect_sysctl(void) {
1460 uint32_t arch;
1461 int r;
1462
1463 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1464 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1465
30868c1c 1466 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1467
f9252236
AJ
1468 if (IN_SET(arch,
1469 SCMP_ARCH_AARCH64,
f9d3fb6b
XW
1470#ifdef SCMP_ARCH_LOONGARCH64
1471 SCMP_ARCH_LOONGARCH64,
1472#endif
f9252236
AJ
1473#ifdef SCMP_ARCH_RISCV64
1474 SCMP_ARCH_RISCV64,
1475#endif
1476 SCMP_ARCH_X32
1477 ))
2e64e8f4
ZJS
1478 /* No _sysctl syscall */
1479 continue;
1480
469830d1
LP
1481 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1482 if (r < 0)
1483 return r;
1484
1485 r = seccomp_rule_add_exact(
add00535
LP
1486 seccomp,
1487 SCMP_ACT_ERRNO(EPERM),
469830d1 1488 SCMP_SYS(_sysctl),
add00535 1489 0);
469830d1 1490 if (r < 0) {
3c098014
ZJS
1491 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m",
1492 seccomp_arch_to_string(arch));
469830d1
LP
1493 continue;
1494 }
1495
1496 r = seccomp_load(seccomp);
3c098014
ZJS
1497 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1498 return r;
1499 if (r < 0)
1500 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m",
1501 seccomp_arch_to_string(arch));
469830d1
LP
1502 }
1503
1504 return 0;
1505}
1506
620dbdd2
KK
1507int seccomp_protect_syslog(void) {
1508 uint32_t arch;
1509 int r;
1510
1511 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1512 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1513
1514 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1515 if (r < 0)
1516 return r;
1517
1518 r = seccomp_rule_add_exact(
1519 seccomp,
1520 SCMP_ACT_ERRNO(EPERM),
1521 SCMP_SYS(syslog),
1522 0);
1523
1524 if (r < 0) {
1525 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1526 continue;
1527 }
1528
1529 r = seccomp_load(seccomp);
3c098014
ZJS
1530 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1531 return r;
1532 if (r < 0)
1533 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m",
1534 seccomp_arch_to_string(arch));
620dbdd2
KK
1535 }
1536
1537 return 0;
1538}
1539
6b000af4 1540int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1541 uint32_t arch;
1542 int r;
1543
1544 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1545 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1546 bool supported;
469830d1 1547
30868c1c 1548 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1549
9606bc4b
LP
1550 switch (arch) {
1551
1552 case SCMP_ARCH_X86_64:
1553 case SCMP_ARCH_X32:
1554 case SCMP_ARCH_ARM:
1555 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1556#ifdef SCMP_ARCH_LOONGARCH64
1557 case SCMP_ARCH_LOONGARCH64:
1558#endif
f5aeac14
JC
1559 case SCMP_ARCH_MIPSEL64N32:
1560 case SCMP_ARCH_MIPS64N32:
1561 case SCMP_ARCH_MIPSEL64:
1562 case SCMP_ARCH_MIPS64:
f9252236
AJ
1563#ifdef SCMP_ARCH_RISCV64
1564 case SCMP_ARCH_RISCV64:
1565#endif
9606bc4b
LP
1566 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1567 supported = true;
1568 break;
1569
9606bc4b
LP
1570 case SCMP_ARCH_S390:
1571 case SCMP_ARCH_S390X:
da1921a5 1572 case SCMP_ARCH_X86:
f5aeac14
JC
1573 case SCMP_ARCH_MIPSEL:
1574 case SCMP_ARCH_MIPS:
344e6b62
SJ
1575#ifdef SCMP_ARCH_PARISC
1576 case SCMP_ARCH_PARISC:
1577#endif
1578#ifdef SCMP_ARCH_PARISC64
1579 case SCMP_ARCH_PARISC64:
1580#endif
d5923e38
ZJS
1581 case SCMP_ARCH_PPC:
1582 case SCMP_ARCH_PPC64:
1583 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1584 default:
1585 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1586 * don't know */
1587 supported = false;
1588 break;
1589 }
1590
1591 if (!supported)
1592 continue;
1593
469830d1
LP
1594 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1595 if (r < 0)
1596 return r;
1597
6b000af4 1598 if (allow_list) {
077e8fc0 1599 int first = 0, last = 0;
469830d1
LP
1600 void *afp;
1601
6b000af4
LP
1602 /* If this is an allow list, we first block the address families that are out of
1603 * range and then everything that is not in the set. First, we find the lowest and
1604 * highest address family in the set. */
469830d1 1605
90e74a66 1606 SET_FOREACH(afp, address_families) {
077e8fc0 1607 int af = PTR_TO_INT(afp);
469830d1
LP
1608
1609 if (af <= 0 || af >= af_max())
1610 continue;
1611
1612 if (first == 0 || af < first)
1613 first = af;
1614
1615 if (last == 0 || af > last)
1616 last = af;
1617 }
1618
1619 assert((first == 0) == (last == 0));
1620
1621 if (first == 0) {
1622
1623 /* No entries in the valid range, block everything */
1624 r = seccomp_rule_add_exact(
1625 seccomp,
1626 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1627 SCMP_SYS(socket),
1628 0);
1629 if (r < 0) {
3c098014
ZJS
1630 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1631 seccomp_arch_to_string(arch));
469830d1
LP
1632 continue;
1633 }
1634
1635 } else {
1636
1637 /* Block everything below the first entry */
1638 r = seccomp_rule_add_exact(
1639 seccomp,
1640 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1641 SCMP_SYS(socket),
1642 1,
1643 SCMP_A0(SCMP_CMP_LT, first));
1644 if (r < 0) {
3c098014
ZJS
1645 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1646 seccomp_arch_to_string(arch));
469830d1
LP
1647 continue;
1648 }
1649
1650 /* Block everything above the last entry */
1651 r = seccomp_rule_add_exact(
1652 seccomp,
1653 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1654 SCMP_SYS(socket),
1655 1,
1656 SCMP_A0(SCMP_CMP_GT, last));
1657 if (r < 0) {
3c098014
ZJS
1658 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1659 seccomp_arch_to_string(arch));
469830d1
LP
1660 continue;
1661 }
1662
1663 /* Block everything between the first and last entry */
077e8fc0 1664 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1665
1666 if (set_contains(address_families, INT_TO_PTR(af)))
1667 continue;
1668
1669 r = seccomp_rule_add_exact(
1670 seccomp,
1671 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1672 SCMP_SYS(socket),
1673 1,
1674 SCMP_A0(SCMP_CMP_EQ, af));
1675 if (r < 0)
1676 break;
1677 }
469830d1 1678 if (r < 0) {
3c098014
ZJS
1679 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1680 seccomp_arch_to_string(arch));
469830d1
LP
1681 continue;
1682 }
1683 }
1684
1685 } else {
1686 void *af;
1687
6b000af4
LP
1688 /* If this is a deny list, then generate one rule for each address family that are
1689 * then combined in OR checks. */
469830d1 1690
90e74a66 1691 SET_FOREACH(af, address_families) {
469830d1
LP
1692 r = seccomp_rule_add_exact(
1693 seccomp,
1694 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1695 SCMP_SYS(socket),
1696 1,
1697 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1698 if (r < 0)
1699 break;
1700 }
469830d1 1701 if (r < 0) {
3c098014
ZJS
1702 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1703 seccomp_arch_to_string(arch));
469830d1
LP
1704 continue;
1705 }
1706 }
1707
1708 r = seccomp_load(seccomp);
3c098014
ZJS
1709 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1710 return r;
1711 if (r < 0)
1712 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m",
1713 seccomp_arch_to_string(arch));
469830d1
LP
1714 }
1715
1716 return 0;
1717}
1718
a9002749 1719int seccomp_restrict_realtime_full(int error_code) {
469830d1
LP
1720 static const int permitted_policies[] = {
1721 SCHED_OTHER,
1722 SCHED_BATCH,
1723 SCHED_IDLE,
1724 };
1725
1726 int r, max_policy = 0;
1727 uint32_t arch;
1728 unsigned i;
1729
a9002749
YW
1730 assert(error_code > 0);
1731
469830d1
LP
1732 /* Determine the highest policy constant we want to allow */
1733 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1734 if (permitted_policies[i] > max_policy)
1735 max_policy = permitted_policies[i];
1736
1737 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1738 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1739 int p;
1740
30868c1c 1741 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1742
1743 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1744 if (r < 0)
1745 return r;
1746
1747 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1748 * allow list. */
469830d1
LP
1749 for (p = 0; p < max_policy; p++) {
1750 bool good = false;
1751
6b000af4 1752 /* Check if this is in the allow list. */
469830d1
LP
1753 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1754 if (permitted_policies[i] == p) {
1755 good = true;
1756 break;
1757 }
1758
1759 if (good)
1760 continue;
1761
1762 /* Deny this policy */
1763 r = seccomp_rule_add_exact(
1764 seccomp,
a9002749 1765 SCMP_ACT_ERRNO(error_code),
469830d1
LP
1766 SCMP_SYS(sched_setscheduler),
1767 1,
1768 SCMP_A1(SCMP_CMP_EQ, p));
1769 if (r < 0) {
3c098014
ZJS
1770 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1771 seccomp_arch_to_string(arch));
469830d1
LP
1772 continue;
1773 }
1774 }
1775
6b000af4
LP
1776 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1777 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1778 r = seccomp_rule_add_exact(
add00535 1779 seccomp,
a9002749 1780 SCMP_ACT_ERRNO(error_code),
469830d1 1781 SCMP_SYS(sched_setscheduler),
add00535 1782 1,
469830d1
LP
1783 SCMP_A1(SCMP_CMP_GT, max_policy));
1784 if (r < 0) {
3c098014
ZJS
1785 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1786 seccomp_arch_to_string(arch));
469830d1
LP
1787 continue;
1788 }
add00535 1789
469830d1 1790 r = seccomp_load(seccomp);
3c098014
ZJS
1791 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1792 return r;
1793 if (r < 0)
1794 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m",
1795 seccomp_arch_to_string(arch));
469830d1
LP
1796 }
1797
1798 return 0;
1799}
1800
6dc66688
ZJS
1801static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1802 uint32_t arch,
1803 int nr,
14cb109d 1804 unsigned arg_cnt,
6dc66688
ZJS
1805 const struct scmp_arg_cmp arg) {
1806 int r;
1807
1808 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1809 if (r < 0) {
1810 _cleanup_free_ char *n = NULL;
1811
1812 n = seccomp_syscall_resolve_num_arch(arch, nr);
1813 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1814 strna(n),
1815 seccomp_arch_to_string(arch));
1816 }
1817
1818 return r;
1819}
1820
2a8d6e63 1821/* For known architectures, check that syscalls are indeed defined or not. */
f9d3fb6b 1822#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
2a8d6e63
ZJS
1823assert_cc(SCMP_SYS(shmget) > 0);
1824assert_cc(SCMP_SYS(shmat) > 0);
1825assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1826#endif
6dc66688 1827
469830d1
LP
1828int seccomp_memory_deny_write_execute(void) {
1829 uint32_t arch;
b069c2a3 1830 unsigned loaded = 0;
469830d1
LP
1831
1832 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1833 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1834 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1835
30868c1c 1836 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1837
8a50cf69
LP
1838 switch (arch) {
1839
bed4668d
CE
1840 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1841 * We ignore that here, which means there's still a way to get writable/executable
344e6b62
SJ
1842 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1843 *
1844 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1845 * on that front (kernel work done in 5.18).
1846 */
bed4668d 1847
8a50cf69 1848 case SCMP_ARCH_X86:
57311925 1849 case SCMP_ARCH_S390:
8a50cf69
LP
1850 filter_syscall = SCMP_SYS(mmap2);
1851 block_syscall = SCMP_SYS(mmap);
bed4668d 1852 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1853 break;
1854
63d00dfb 1855 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1856 case SCMP_ARCH_PPC64:
1857 case SCMP_ARCH_PPC64LE:
bed4668d 1858 case SCMP_ARCH_S390X:
2a8d6e63 1859 filter_syscall = SCMP_SYS(mmap);
bed4668d 1860 /* shmat multiplexed, see above */
8a50cf69
LP
1861 break;
1862
4278d1f5
ZJS
1863 case SCMP_ARCH_ARM:
1864 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1865 shmat_syscall = SCMP_SYS(shmat);
1866 break;
1867
8a50cf69
LP
1868 case SCMP_ARCH_X86_64:
1869 case SCMP_ARCH_X32:
79873bc8 1870 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1871#ifdef SCMP_ARCH_LOONGARCH64
1872 case SCMP_ARCH_LOONGARCH64:
1873#endif
f9252236
AJ
1874#ifdef SCMP_ARCH_RISCV64
1875 case SCMP_ARCH_RISCV64:
1876#endif
f9d3fb6b 1877 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
8a50cf69
LP
1878 shmat_syscall = SCMP_SYS(shmat);
1879 break;
1880
1881 /* Please add more definitions here, if you port systemd to other architectures! */
1882
f9d3fb6b 1883#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
8a50cf69
LP
1884#warning "Consider adding the right mmap() syscall definitions here!"
1885#endif
1886 }
1887
1888 /* Can't filter mmap() on this arch, then skip it */
1889 if (filter_syscall == 0)
1890 continue;
1891
469830d1
LP
1892 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1893 if (r < 0)
1894 return r;
1895
6dc66688
ZJS
1896 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1897 1,
1898 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1899 if (r < 0)
1900 continue;
8a50cf69
LP
1901
1902 if (block_syscall != 0) {
6dc66688
ZJS
1903 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1904 if (r < 0)
8a50cf69 1905 continue;
add00535 1906 }
a3be2849 1907
6dc66688
ZJS
1908 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1909 1,
b835eeb4
ZJS
1910 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1911 if (r < 0)
1912 continue;
1913
1914 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1915 1,
6dc66688
ZJS
1916 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1917 if (r < 0)
469830d1 1918 continue;
add00535 1919
67fb5f33 1920 if (shmat_syscall > 0) {
5ef3ed97 1921 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1922 1,
1923 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1924 if (r < 0)
8a50cf69 1925 continue;
469830d1
LP
1926 }
1927
1928 r = seccomp_load(seccomp);
3c098014
ZJS
1929 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1930 return r;
1931 if (r < 0)
b069c2a3
ZJS
1932 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1933 seccomp_arch_to_string(arch));
903659e7 1934 loaded++;
469830d1 1935 }
add00535 1936
903659e7 1937 if (loaded == 0)
b069c2a3 1938 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1939
1940 return loaded;
469830d1
LP
1941}
1942
1943int seccomp_restrict_archs(Set *archs) {
1944 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1 1945 int r;
65976868 1946 bool blocked_new = false;
469830d1
LP
1947
1948 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1949 * list.
1950 *
1951 * There are some qualifications. However the most important use is to stop processes from bypassing
1952 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1953 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1954
2428aaf8
AJ
1955 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1956 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1957 * to run a program with the restrictions applied. */
469830d1
LP
1958 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1959 if (!seccomp)
1960 return -ENOMEM;
1961
65976868
GDF
1962 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1963 uint32_t arch = seccomp_local_archs[i];
2428aaf8 1964
f833df38
BB
1965 /* See above comment, our "native" architecture is never blocked. */
1966 if (arch == seccomp_arch_native())
1967 continue;
1968
65976868
GDF
1969 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1970 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1971 continue;
2428aaf8 1972
65976868 1973 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
2428aaf8 1974
65976868
GDF
1975 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1976 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1977 * The important thing is that you can block the old 32-bit x86 syscalls.
1978 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1979 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1980 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1981
1982 if (block) {
1983 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1984 blocked_new = true;
1985 } else {
1986 r = seccomp_arch_add(seccomp, arch);
1987 if (r < 0 && r != -EEXIST)
1988 return r;
1989 }
add00535
LP
1990 }
1991
65976868
GDF
1992 /* All architectures that will be blocked by the seccomp program were
1993 * already blocked. */
1994 if (!blocked_new)
1995 return 0;
1996
469830d1
LP
1997 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1998 if (r < 0)
1999 return r;
add00535 2000
1c6af69b 2001 r = seccomp_load(seccomp);
3c098014
ZJS
2002 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2003 return r;
2004 if (r < 0)
1c6af69b
LP
2005 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
2006
2007 return 0;
a3be2849 2008}
b16bd535 2009
de7fef4b
ZJS
2010int parse_syscall_archs(char **l, Set **ret_archs) {
2011 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
2012 int r;
2013
2014 assert(l);
de7fef4b 2015 assert(ret_archs);
b16bd535
YW
2016
2017 STRV_FOREACH(s, l) {
2018 uint32_t a;
2019
2020 r = seccomp_arch_from_string(*s, &a);
2021 if (r < 0)
2022 return -EINVAL;
2023
de7fef4b 2024 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
2025 if (r < 0)
2026 return -ENOMEM;
2027 }
2028
de7fef4b 2029 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
2030 return 0;
2031}
165a31c0 2032
8cfa775f 2033int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
2034 int r;
2035
2036 assert(set);
2037
2038 NULSTR_FOREACH(i, set->value) {
2039
2040 if (i[0] == '@') {
2041 const SyscallFilterSet *more;
2042
2043 more = syscall_filter_set_find(i);
2044 if (!more)
2045 return -ENXIO;
2046
165a31c0
LP
2047 r = seccomp_filter_set_add(filter, add, more);
2048 if (r < 0)
2049 return r;
2050 } else {
2051 int id;
2052
2053 id = seccomp_syscall_resolve_name(i);
ff217dc3 2054 if (id == __NR_SCMP_ERROR) {
309a4212 2055 log_debug("System call %s is not known, ignoring.", i);
ff217dc3
LP
2056 continue;
2057 }
165a31c0
LP
2058
2059 if (add) {
8cfa775f 2060 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
2061 if (r < 0)
2062 return r;
2063 } else
8cfa775f 2064 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
2065 }
2066 }
2067
2068 return 0;
2069}
78e864e5
TM
2070
2071int seccomp_lock_personality(unsigned long personality) {
72eafe71 2072 uint32_t arch;
78e864e5
TM
2073 int r;
2074
72eafe71
LP
2075 if (personality >= PERSONALITY_INVALID)
2076 return -EINVAL;
78e864e5 2077
72eafe71
LP
2078 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2079 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 2080
72eafe71
LP
2081 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2082 if (r < 0)
2083 return r;
2084
2085 r = seccomp_rule_add_exact(
2086 seccomp,
2087 SCMP_ACT_ERRNO(EPERM),
2088 SCMP_SYS(personality),
2089 1,
2090 SCMP_A0(SCMP_CMP_NE, personality));
448ac526 2091 if (r < 0) {
3c098014
ZJS
2092 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
2093 seccomp_arch_to_string(arch));
448ac526
LP
2094 continue;
2095 }
72eafe71
LP
2096
2097 r = seccomp_load(seccomp);
3c098014
ZJS
2098 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2099 return r;
2100 if (r < 0)
2101 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m",
2102 seccomp_arch_to_string(arch));
72eafe71
LP
2103 }
2104
2105 return 0;
78e864e5 2106}
aecd5ac6
TM
2107
2108int seccomp_protect_hostname(void) {
2109 uint32_t arch;
2110 int r;
2111
2112 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2113 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2114
2115 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2116 if (r < 0)
2117 return r;
2118
2119 r = seccomp_rule_add_exact(
2120 seccomp,
2121 SCMP_ACT_ERRNO(EPERM),
2122 SCMP_SYS(sethostname),
2123 0);
9e6e543c 2124 if (r < 0) {
3c098014
ZJS
2125 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m",
2126 seccomp_arch_to_string(arch));
aecd5ac6 2127 continue;
9e6e543c 2128 }
aecd5ac6
TM
2129
2130 r = seccomp_rule_add_exact(
2131 seccomp,
2132 SCMP_ACT_ERRNO(EPERM),
2133 SCMP_SYS(setdomainname),
2134 0);
9e6e543c 2135 if (r < 0) {
3c098014
ZJS
2136 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m",
2137 seccomp_arch_to_string(arch));
aecd5ac6 2138 continue;
9e6e543c 2139 }
aecd5ac6
TM
2140
2141 r = seccomp_load(seccomp);
3c098014
ZJS
2142 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2143 return r;
2144 if (r < 0)
2145 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m",
2146 seccomp_arch_to_string(arch));
aecd5ac6
TM
2147 }
2148
2149 return 0;
2150}
3c27973b 2151
da4dc9a6
ZJS
2152static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2153 /* Checks the mode_t parameter of the following system calls:
2154 *
8b45281d 2155 * → chmod() + fchmod() + fchmodat() + fchmodat2()
da4dc9a6
ZJS
2156 * → open() + creat() + openat()
2157 * → mkdir() + mkdirat()
2158 * → mknod() + mknodat()
2159 *
2160 * Returns error if *everything* failed, and 0 otherwise.
2161 */
6d95e7d9 2162 int r;
da4dc9a6
ZJS
2163 bool any = false;
2164
2165 r = seccomp_rule_add_exact(
2166 seccomp,
2167 SCMP_ACT_ERRNO(EPERM),
2168 SCMP_SYS(chmod),
2169 1,
2170 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2171 if (r < 0)
2172 log_debug_errno(r, "Failed to add filter for chmod: %m");
2173 else
2174 any = true;
2175
2176 r = seccomp_rule_add_exact(
2177 seccomp,
2178 SCMP_ACT_ERRNO(EPERM),
2179 SCMP_SYS(fchmod),
2180 1,
2181 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2182 if (r < 0)
2183 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2184 else
2185 any = true;
2186
2187 r = seccomp_rule_add_exact(
2188 seccomp,
2189 SCMP_ACT_ERRNO(EPERM),
2190 SCMP_SYS(fchmodat),
2191 1,
2192 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2193 if (r < 0)
2194 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2195 else
2196 any = true;
2197
8b45281d
AM
2198#if defined(__SNR_fchmodat2)
2199 r = seccomp_rule_add_exact(
2200 seccomp,
2201 SCMP_ACT_ERRNO(EPERM),
2202 SCMP_SYS(fchmodat2),
2203 1,
2204 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2205#else
2206 /* It looks like this libseccomp does not know about fchmodat2().
2207 * Pretend the fchmodat2() system call is not supported at all,
2208 * regardless of the kernel version. */
2209 r = seccomp_rule_add_exact(
2210 seccomp,
2211 SCMP_ACT_ERRNO(ENOSYS),
2212 __NR_fchmodat2,
2213 0);
2214#endif
2215 if (r < 0)
2216 log_debug_errno(r, "Failed to add filter for fchmodat2: %m");
2217 else
2218 any = true;
2219
da4dc9a6
ZJS
2220 r = seccomp_rule_add_exact(
2221 seccomp,
2222 SCMP_ACT_ERRNO(EPERM),
2223 SCMP_SYS(mkdir),
2224 1,
2225 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2226 if (r < 0)
2227 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2228 else
2229 any = true;
2230
2231 r = seccomp_rule_add_exact(
2232 seccomp,
2233 SCMP_ACT_ERRNO(EPERM),
2234 SCMP_SYS(mkdirat),
2235 1,
2236 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2237 if (r < 0)
2238 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2239 else
2240 any = true;
2241
2242 r = seccomp_rule_add_exact(
2243 seccomp,
2244 SCMP_ACT_ERRNO(EPERM),
2245 SCMP_SYS(mknod),
2246 1,
2247 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2248 if (r < 0)
2249 log_debug_errno(r, "Failed to add filter for mknod: %m");
2250 else
2251 any = true;
2252
2253 r = seccomp_rule_add_exact(
2254 seccomp,
2255 SCMP_ACT_ERRNO(EPERM),
2256 SCMP_SYS(mknodat),
2257 1,
2258 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2259 if (r < 0)
2260 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2261 else
2262 any = true;
2263
da4dc9a6
ZJS
2264 r = seccomp_rule_add_exact(
2265 seccomp,
2266 SCMP_ACT_ERRNO(EPERM),
2267 SCMP_SYS(open),
2268 2,
2269 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2270 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2271 if (r < 0)
2272 log_debug_errno(r, "Failed to add filter for open: %m");
2273 else
2274 any = true;
da4dc9a6
ZJS
2275
2276 r = seccomp_rule_add_exact(
2277 seccomp,
2278 SCMP_ACT_ERRNO(EPERM),
2279 SCMP_SYS(openat),
2280 2,
2281 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2282 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2283 if (r < 0)
2284 log_debug_errno(r, "Failed to add filter for openat: %m");
2285 else
2286 any = true;
2287
ecc04067
LP
2288#if defined(__SNR_openat2)
2289 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2290 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2291 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
57353d29
MG
2292 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2293 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2294 * to call open() or openat() instead. We can properly enforce policy for those functions. */
ecc04067
LP
2295 r = seccomp_rule_add_exact(
2296 seccomp,
57353d29 2297 SCMP_ACT_ERRNO(ENOSYS),
ecc04067
LP
2298 SCMP_SYS(openat2),
2299 0);
2300 if (r < 0)
2301 log_debug_errno(r, "Failed to add filter for openat2: %m");
2302 else
2303 any = true;
2304#endif
2305
da4dc9a6
ZJS
2306 r = seccomp_rule_add_exact(
2307 seccomp,
2308 SCMP_ACT_ERRNO(EPERM),
2309 SCMP_SYS(creat),
2310 1,
2311 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2312 if (r < 0)
2313 log_debug_errno(r, "Failed to add filter for creat: %m");
2314 else
2315 any = true;
2316
2317 return any ? 0 : r;
2318}
2319
3c27973b
LP
2320int seccomp_restrict_suid_sgid(void) {
2321 uint32_t arch;
da4dc9a6 2322 int r, k;
3c27973b
LP
2323
2324 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2325 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2326
2327 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2328 if (r < 0)
2329 return r;
2330
da4dc9a6
ZJS
2331 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2332 if (r < 0)
3c098014
ZJS
2333 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m",
2334 seccomp_arch_to_string(arch));
3c27973b 2335
da4dc9a6
ZJS
2336 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2337 if (k < 0)
a539314a 2338 log_debug_errno(k, "Failed to add sgid rule for architecture %s, ignoring: %m",
3c098014 2339 seccomp_arch_to_string(arch));
3c27973b 2340
da4dc9a6 2341 if (r < 0 && k < 0)
3c27973b 2342 continue;
3c27973b
LP
2343
2344 r = seccomp_load(seccomp);
3c098014
ZJS
2345 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2346 return r;
2347 if (r < 0)
2348 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m",
2349 seccomp_arch_to_string(arch));
3c27973b
LP
2350 }
2351
2352 return 0;
2353}
915fb324
LP
2354
2355uint32_t scmp_act_kill_process(void) {
2356
2357 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2358 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2359 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2360 * for single-threaded apps does the right thing. */
2361
2362#ifdef SCMP_ACT_KILL_PROCESS
2363 if (seccomp_api_get() >= 3)
2364 return SCMP_ACT_KILL_PROCESS;
2365#endif
2366
2367 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2368}
22eadc28
YW
2369
2370int parse_syscall_and_errno(const char *in, char **name, int *error) {
2371 _cleanup_free_ char *n = NULL;
2372 char *p;
2373 int e = -1;
2374
2375 assert(in);
2376 assert(name);
2377 assert(error);
2378
2379 /*
2380 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2381 * If errno is omitted, then error is set to -1.
2382 * Empty syscall name is not allowed.
2383 * Here, we do not check that the syscall name is valid or not.
2384 */
2385
2386 p = strchr(in, ':');
2387 if (p) {
2388 e = seccomp_parse_errno_or_action(p + 1);
2389 if (e < 0)
2390 return e;
2391
2392 n = strndup(in, p - in);
2393 } else
2394 n = strdup(in);
2395
2396 if (!n)
2397 return -ENOMEM;
2398
2399 if (isempty(n))
2400 return -EINVAL;
2401
2402 *error = e;
2403 *name = TAKE_PTR(n);
2404
2405 return 0;
2406}
4a4654e0
LP
2407
2408static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2409 bool any = false;
2410 int r;
2411
2412 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2413 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2414
4a4654e0
LP
2415 r = seccomp_rule_add_exact(
2416 seccomp,
2417 SCMP_ACT_ERRNO(EINVAL),
2418 SCMP_SYS(open),
2419 1,
2420 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2421 if (r < 0)
2422 log_debug_errno(r, "Failed to add filter for open: %m");
2423 else
2424 any = true;
4a4654e0
LP
2425
2426 r = seccomp_rule_add_exact(
2427 seccomp,
2428 SCMP_ACT_ERRNO(EINVAL),
2429 SCMP_SYS(openat),
2430 1,
2431 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2432 if (r < 0)
2433 log_debug_errno(r, "Failed to add filter for openat: %m");
2434 else
2435 any = true;
2436
2437#if defined(__SNR_openat2)
2438 /* The new openat2() system call can't be filtered sensibly, see above. */
2439 r = seccomp_rule_add_exact(
2440 seccomp,
2441 SCMP_ACT_ERRNO(ENOSYS),
2442 SCMP_SYS(openat2),
2443 0);
2444 if (r < 0)
2445 log_debug_errno(r, "Failed to add filter for openat2: %m");
2446 else
2447 any = true;
2448#endif
2449
2450 return any ? 0 : r;
2451}
2452
2453int seccomp_suppress_sync(void) {
2454 uint32_t arch;
2455 int r;
2456
2457 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2458 * manageable, and also masks O_SYNC/O_DSYNC */
2459
2460 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2461 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
4a4654e0
LP
2462
2463 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2464 if (r < 0)
2465 return r;
2466
2467 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2468 int id;
2469
2470 id = seccomp_syscall_resolve_name(c);
2471 if (id == __NR_SCMP_ERROR) {
2472 log_debug("System call %s is not known, ignoring.", c);
2473 continue;
2474 }
2475
2476 r = seccomp_rule_add_exact(
2477 seccomp,
2478 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2479 id,
2480 0);
2481 if (r < 0)
2482 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2483 }
2484
2485 (void) block_open_flag(seccomp, O_SYNC);
2486#if O_DSYNC != O_SYNC
2487 (void) block_open_flag(seccomp, O_DSYNC);
2488#endif
2489
2490 r = seccomp_load(seccomp);
3c098014
ZJS
2491 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2492 return r;
2493 if (r < 0)
2494 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m",
2495 seccomp_arch_to_string(arch));
4a4654e0
LP
2496 }
2497
2498 return 0;
2499}