]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
man/systemd-sysext: list ephemeral/ephemeral-import in the list of options
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
57183d11 2
3c27973b 3#include <fcntl.h>
469830d1 4#include <linux/seccomp.h>
69a283c5 5#include <sched.h>
469830d1 6#include <sys/mman.h>
d347d902 7#include <sys/prctl.h>
469830d1 8#include <sys/shm.h>
3c27973b 9#include <sys/stat.h>
57183d11 10
469830d1 11#include "af-list.h"
add00535 12#include "alloc-util.h"
44aaddad 13#include "env-util.h"
d8b4d14d 14#include "errno-list.h"
93a1f792 15#include "log.h"
241b1577 16#include "namespace-util.h"
add00535 17#include "nsflags.h"
d8b4d14d 18#include "nulstr-util.h"
69a283c5 19#include "parse-util.h"
78e864e5 20#include "process-util.h"
cf0fbc49 21#include "seccomp-util.h"
b16bd535 22#include "set.h"
07630cea 23#include "string-util.h"
b16bd535 24#include "strv.h"
469830d1 25
69a283c5
DDM
26#if HAVE_SECCOMP
27
65976868
GDF
28/* This array will be modified at runtime as seccomp_restrict_archs is called. */
29uint32_t seccomp_local_archs[] = {
469830d1 30
6b000af4 31 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
32
33#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
34 SCMP_ARCH_X86,
35 SCMP_ARCH_X86_64,
f2d9751c
LP
36 SCMP_ARCH_X32, /* native */
37#elif defined(__x86_64__) && !defined(__ILP32__)
38 SCMP_ARCH_X86,
469830d1 39 SCMP_ARCH_X32,
f2d9751c
LP
40 SCMP_ARCH_X86_64, /* native */
41#elif defined(__i386__)
42 SCMP_ARCH_X86,
43#elif defined(__aarch64__)
469830d1 44 SCMP_ARCH_ARM,
f2d9751c
LP
45 SCMP_ARCH_AARCH64, /* native */
46#elif defined(__arm__)
47 SCMP_ARCH_ARM,
f9d3fb6b
XW
48#elif defined(__loongarch_lp64)
49 SCMP_ARCH_LOONGARCH64,
f2d9751c
LP
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS, /* native */
53#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 54 SCMP_ARCH_MIPS,
f2d9751c
LP
55 SCMP_ARCH_MIPSEL, /* native */
56#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS,
59 SCMP_ARCH_MIPSEL64N32,
469830d1 60 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
61 SCMP_ARCH_MIPSEL64,
62 SCMP_ARCH_MIPS64, /* native */
63#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
64 SCMP_ARCH_MIPS,
469830d1 65 SCMP_ARCH_MIPSEL,
f2d9751c
LP
66 SCMP_ARCH_MIPS64N32,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64,
69 SCMP_ARCH_MIPSEL64, /* native */
70#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS,
469830d1 73 SCMP_ARCH_MIPSEL64,
f2d9751c 74 SCMP_ARCH_MIPS64,
469830d1 75 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
76 SCMP_ARCH_MIPS64N32, /* native */
77#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
78 SCMP_ARCH_MIPS,
79 SCMP_ARCH_MIPSEL,
80 SCMP_ARCH_MIPS64,
81 SCMP_ARCH_MIPSEL64,
82 SCMP_ARCH_MIPS64N32,
83 SCMP_ARCH_MIPSEL64N32, /* native */
344e6b62
SJ
84#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
85 SCMP_ARCH_PARISC,
86 SCMP_ARCH_PARISC64, /* native */
87#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
88 SCMP_ARCH_PARISC,
f2d9751c 89#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 90 SCMP_ARCH_PPC,
469830d1 91 SCMP_ARCH_PPC64LE,
f2d9751c
LP
92 SCMP_ARCH_PPC64, /* native */
93#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
94 SCMP_ARCH_PPC,
95 SCMP_ARCH_PPC64,
96 SCMP_ARCH_PPC64LE, /* native */
97#elif defined(__powerpc__)
98 SCMP_ARCH_PPC,
f9252236
AJ
99#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
100 SCMP_ARCH_RISCV64,
f2d9751c
LP
101#elif defined(__s390x__)
102 SCMP_ARCH_S390,
103 SCMP_ARCH_S390X, /* native */
104#elif defined(__s390__)
469830d1 105 SCMP_ARCH_S390,
469830d1 106#endif
65976868 107 SECCOMP_LOCAL_ARCH_END
469830d1 108 };
57183d11
LP
109
110const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
111 /* Maintain order used in <seccomp.h>.
112 *
113 * Names used here should be the same as those used for ConditionArchitecture=,
114 * except for "subarchitectures" like x32. */
57183d11 115
79893116 116 switch (c) {
aa34055f 117 case SCMP_ARCH_NATIVE:
57183d11 118 return "native";
aa34055f 119 case SCMP_ARCH_X86:
57183d11 120 return "x86";
aa34055f 121 case SCMP_ARCH_X86_64:
57183d11 122 return "x86-64";
aa34055f 123 case SCMP_ARCH_X32:
57183d11 124 return "x32";
aa34055f 125 case SCMP_ARCH_ARM:
57183d11 126 return "arm";
aa34055f
ZJS
127 case SCMP_ARCH_AARCH64:
128 return "arm64";
f9d3fb6b
XW
129#ifdef SCMP_ARCH_LOONGARCH64
130 case SCMP_ARCH_LOONGARCH64:
131 return "loongarch64";
132#endif
aa34055f
ZJS
133 case SCMP_ARCH_MIPS:
134 return "mips";
135 case SCMP_ARCH_MIPS64:
136 return "mips64";
137 case SCMP_ARCH_MIPS64N32:
138 return "mips64-n32";
139 case SCMP_ARCH_MIPSEL:
140 return "mips-le";
141 case SCMP_ARCH_MIPSEL64:
142 return "mips64-le";
143 case SCMP_ARCH_MIPSEL64N32:
144 return "mips64-le-n32";
344e6b62
SJ
145#ifdef SCMP_ARCH_PARISC
146 case SCMP_ARCH_PARISC:
147 return "parisc";
148#endif
149#ifdef SCMP_ARCH_PARISC64
150 case SCMP_ARCH_PARISC64:
151 return "parisc64";
152#endif
aa34055f
ZJS
153 case SCMP_ARCH_PPC:
154 return "ppc";
155 case SCMP_ARCH_PPC64:
156 return "ppc64";
157 case SCMP_ARCH_PPC64LE:
158 return "ppc64-le";
f9252236
AJ
159#ifdef SCMP_ARCH_RISCV64
160 case SCMP_ARCH_RISCV64:
161 return "riscv64";
162#endif
aa34055f 163 case SCMP_ARCH_S390:
6abfd303 164 return "s390";
aa34055f 165 case SCMP_ARCH_S390X:
6abfd303 166 return "s390x";
aa34055f
ZJS
167 default:
168 return NULL;
169 }
57183d11
LP
170}
171
172int seccomp_arch_from_string(const char *n, uint32_t *ret) {
173 if (!n)
174 return -EINVAL;
175
176 assert(ret);
177
178 if (streq(n, "native"))
179 *ret = SCMP_ARCH_NATIVE;
180 else if (streq(n, "x86"))
181 *ret = SCMP_ARCH_X86;
182 else if (streq(n, "x86-64"))
183 *ret = SCMP_ARCH_X86_64;
184 else if (streq(n, "x32"))
185 *ret = SCMP_ARCH_X32;
186 else if (streq(n, "arm"))
187 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
188 else if (streq(n, "arm64"))
189 *ret = SCMP_ARCH_AARCH64;
f9d3fb6b
XW
190#ifdef SCMP_ARCH_LOONGARCH64
191 else if (streq(n, "loongarch64"))
192 *ret = SCMP_ARCH_LOONGARCH64;
193#endif
aa34055f
ZJS
194 else if (streq(n, "mips"))
195 *ret = SCMP_ARCH_MIPS;
196 else if (streq(n, "mips64"))
197 *ret = SCMP_ARCH_MIPS64;
198 else if (streq(n, "mips64-n32"))
199 *ret = SCMP_ARCH_MIPS64N32;
200 else if (streq(n, "mips-le"))
201 *ret = SCMP_ARCH_MIPSEL;
202 else if (streq(n, "mips64-le"))
203 *ret = SCMP_ARCH_MIPSEL64;
204 else if (streq(n, "mips64-le-n32"))
205 *ret = SCMP_ARCH_MIPSEL64N32;
344e6b62
SJ
206#ifdef SCMP_ARCH_PARISC
207 else if (streq(n, "parisc"))
208 *ret = SCMP_ARCH_PARISC;
209#endif
210#ifdef SCMP_ARCH_PARISC64
211 else if (streq(n, "parisc64"))
212 *ret = SCMP_ARCH_PARISC64;
213#endif
aa34055f
ZJS
214 else if (streq(n, "ppc"))
215 *ret = SCMP_ARCH_PPC;
216 else if (streq(n, "ppc64"))
217 *ret = SCMP_ARCH_PPC64;
218 else if (streq(n, "ppc64-le"))
219 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
220#ifdef SCMP_ARCH_RISCV64
221 else if (streq(n, "riscv64"))
222 *ret = SCMP_ARCH_RISCV64;
223#endif
6abfd303
HB
224 else if (streq(n, "s390"))
225 *ret = SCMP_ARCH_S390;
226 else if (streq(n, "s390x"))
227 *ret = SCMP_ARCH_S390X;
57183d11
LP
228 else
229 return -EINVAL;
230
231 return 0;
232}
e9642be2 233
469830d1 234int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 235 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
236 int r;
237
469830d1
LP
238 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
239 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
240
241 seccomp = seccomp_init(default_action);
242 if (!seccomp)
243 return -ENOMEM;
244
469830d1
LP
245 if (arch != SCMP_ARCH_NATIVE &&
246 arch != seccomp_arch_native()) {
247
1b52793d 248 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 249 if (r < 0)
b4eaa6cc 250 return r;
469830d1 251
1b52793d 252 r = seccomp_arch_add(seccomp, arch);
469830d1 253 if (r < 0)
b4eaa6cc 254 return r;
469830d1
LP
255
256 assert(seccomp_arch_exist(seccomp, arch) >= 0);
257 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
258 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
259 } else {
260 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
261 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
262 }
263
264 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 265 if (r < 0)
b4eaa6cc 266 return r;
8d7b0c8f
LP
267
268 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
269 if (r < 0)
b4eaa6cc 270 return r;
8d7b0c8f 271
44aaddad
SD
272#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
273 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
274 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
275 if (r < 0)
276 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
277 }
278#endif
279
b4eaa6cc 280 *ret = TAKE_PTR(seccomp);
8d7b0c8f 281 return 0;
8d7b0c8f
LP
282}
283
static bool is_basic_seccomp_available(void) {
        /* Queries the current seccomp mode of the calling process; any non-negative answer is taken
         * as an indication that the seccomp interface is there. */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
287
static bool is_seccomp_filter_available(void) {
        /* Probes for filter mode by asking for it with a NULL filter program: the call is expected to
         * fail, and only a failure with EFAULT is taken to mean that filter mode is available. */

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
292
bool is_seccomp_available(void) {
        /* Tells whether seccomp filtering may be used. The answer is determined on the first call and
         * then served from a static cache. Setting $SYSTEMD_SECCOMP to false turns seccomp off
         * explicitly; otherwise the two runtime probes decide. */

        static int cached_enabled = -1;
        int env;

        if (cached_enabled >= 0)
                return cached_enabled;

        env = secure_getenv_bool("SYSTEMD_SECCOMP");
        if (env == 0) {
                cached_enabled = false;
                return cached_enabled;
        }

        if (env < 0 && env != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(env, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
313
8130926d 314const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 315 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 316 .name = "@default",
d5efc18b 317 .help = "System calls that are always permitted",
40eb6a80 318 .value =
e9966634 319 "@sandbox\0"
5f02870a 320 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
5abede32 321 "brk\0"
8e24b1d2 322 "cacheflush\0"
40eb6a80 323 "clock_getres\0"
6ca67710 324 "clock_getres_time64\0"
40eb6a80 325 "clock_gettime\0"
6ca67710 326 "clock_gettime64\0"
40eb6a80 327 "clock_nanosleep\0"
6ca67710 328 "clock_nanosleep_time64\0"
40eb6a80
ZJS
329 "execve\0"
330 "exit\0"
331 "exit_group\0"
e41b0f42 332 "futex\0"
6ca67710 333 "futex_time64\0"
76e86b8d 334 "futex_waitv\0"
e41b0f42
LP
335 "get_robust_list\0"
336 "get_thread_area\0"
09d3020b
DH
337 "getegid\0"
338 "getegid32\0"
339 "geteuid\0"
340 "geteuid32\0"
341 "getgid\0"
342 "getgid32\0"
343 "getgroups\0"
344 "getgroups32\0"
345 "getpgid\0"
346 "getpgrp\0"
347 "getpid\0"
348 "getppid\0"
14f4b1b5 349 "getrandom\0"
09d3020b
DH
350 "getresgid\0"
351 "getresgid32\0"
352 "getresuid\0"
353 "getresuid32\0"
40eb6a80 354 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
355 "getsid\0"
356 "gettid\0"
40eb6a80 357 "gettimeofday\0"
09d3020b
DH
358 "getuid\0"
359 "getuid32\0"
e41b0f42 360 "membarrier\0"
5abede32
LP
361 "mmap\0"
362 "mmap2\0"
47286254 363 "mprotect\0"
626df2fe 364 "mseal\0"
11b9105d 365 "munmap\0"
40eb6a80
ZJS
366 "nanosleep\0"
367 "pause\0"
4c3a9176 368 "prlimit64\0"
e41b0f42 369 "restart_syscall\0"
09925036 370 "riscv_flush_icache\0"
ca15fc48 371 "riscv_hwprobe\0"
6fee3be0 372 "rseq\0"
40eb6a80 373 "rt_sigreturn\0"
7df660e4 374 "sched_getaffinity\0"
8f44de08 375 "sched_yield\0"
e41b0f42
LP
376 "set_robust_list\0"
377 "set_thread_area\0"
378 "set_tid_address\0"
ce5faeac 379 "set_tls\0"
40eb6a80
ZJS
380 "sigreturn\0"
381 "time\0"
4c3a9176 382 "ugetrlimit\0"
d693c483 383 "uretprobe\0"
40eb6a80 384 },
44898c53
LP
385 [SYSCALL_FILTER_SET_AIO] = {
386 .name = "@aio",
387 .help = "Asynchronous IO",
388 .value =
389 "io_cancel\0"
390 "io_destroy\0"
391 "io_getevents\0"
a05cfe23 392 "io_pgetevents\0"
6ca67710 393 "io_pgetevents_time64\0"
44898c53
LP
394 "io_setup\0"
395 "io_submit\0"
9e486265
LP
396 "io_uring_enter\0"
397 "io_uring_register\0"
398 "io_uring_setup\0"
44898c53 399 },
133ddbbe 400 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 401 .name = "@basic-io",
d5efc18b 402 .help = "Basic IO",
133ddbbe 403 .value =
648a0ed0 404 "_llseek\0"
133ddbbe 405 "close\0"
6ea0d25c 406 "close_range\0"
648a0ed0 407 "dup\0"
133ddbbe
LP
408 "dup2\0"
409 "dup3\0"
d361ea5f 410 "llseek\0"
133ddbbe
LP
411 "lseek\0"
412 "pread64\0"
413 "preadv\0"
44898c53 414 "preadv2\0"
133ddbbe
LP
415 "pwrite64\0"
416 "pwritev\0"
44898c53 417 "pwritev2\0"
133ddbbe
LP
418 "read\0"
419 "readv\0"
420 "write\0"
421 "writev\0"
422 },
44898c53
LP
423 [SYSCALL_FILTER_SET_CHOWN] = {
424 .name = "@chown",
425 .help = "Change ownership of files and directories",
426 .value =
427 "chown\0"
428 "chown32\0"
429 "fchown\0"
430 "fchown32\0"
431 "fchownat\0"
432 "lchown\0"
433 "lchown32\0"
434 },
8130926d 435 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 436 .name = "@clock",
d5efc18b 437 .help = "Change the system time",
201c1cc2
TM
438 .value =
439 "adjtimex\0"
1f9ac68b 440 "clock_adjtime\0"
6ca67710 441 "clock_adjtime64\0"
1f9ac68b 442 "clock_settime\0"
6ca67710 443 "clock_settime64\0"
201c1cc2 444 "settimeofday\0"
8130926d
LP
445 },
446 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 447 .name = "@cpu-emulation",
d5efc18b 448 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
449 .value =
450 "modify_ldt\0"
451 "subpage_prot\0"
452 "switch_endian\0"
453 "vm86\0"
454 "vm86old\0"
8130926d
LP
455 },
456 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 457 .name = "@debug",
d5efc18b 458 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
459 .value =
460 "lookup_dcookie\0"
461 "perf_event_open\0"
8270e3d8 462 "pidfd_getfd\0"
1f9ac68b
LP
463 "ptrace\0"
464 "rtas\0"
465 "s390_runtime_instr\0"
466 "sys_debug_setcontext\0"
8130926d 467 },
1a1b13c9
LP
468 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
469 .name = "@file-system",
470 .help = "File system operations",
471 .value =
472 "access\0"
473 "chdir\0"
474 "chmod\0"
475 "close\0"
476 "creat\0"
477 "faccessat\0"
bcf08acb 478 "faccessat2\0"
1a1b13c9
LP
479 "fallocate\0"
480 "fchdir\0"
481 "fchmod\0"
482 "fchmodat\0"
6e10405a 483 "fchmodat2\0"
1a1b13c9 484 "fcntl\0"
ceaa6aa7 485 "fcntl64\0"
1a1b13c9
LP
486 "fgetxattr\0"
487 "flistxattr\0"
ceaa6aa7 488 "fremovexattr\0"
1a1b13c9 489 "fsetxattr\0"
1a1b13c9 490 "fstat\0"
ceaa6aa7 491 "fstat64\0"
d361ea5f 492 "fstatat\0"
1a1b13c9 493 "fstatat64\0"
1a1b13c9 494 "fstatfs\0"
ceaa6aa7 495 "fstatfs64\0"
1a1b13c9 496 "ftruncate\0"
ceaa6aa7 497 "ftruncate64\0"
1a1b13c9
LP
498 "futimesat\0"
499 "getcwd\0"
1a1b13c9 500 "getdents\0"
ceaa6aa7 501 "getdents64\0"
1a1b13c9 502 "getxattr\0"
fb35d775 503 "getxattrat\0"
1a1b13c9 504 "inotify_add_watch\0"
ceaa6aa7 505 "inotify_init\0"
1a1b13c9
LP
506 "inotify_init1\0"
507 "inotify_rm_watch\0"
508 "lgetxattr\0"
509 "link\0"
510 "linkat\0"
626df2fe 511 "listmount\0"
1a1b13c9 512 "listxattr\0"
fb35d775 513 "listxattrat\0"
1a1b13c9
LP
514 "llistxattr\0"
515 "lremovexattr\0"
516 "lsetxattr\0"
1a1b13c9 517 "lstat\0"
ceaa6aa7 518 "lstat64\0"
1a1b13c9
LP
519 "mkdir\0"
520 "mkdirat\0"
521 "mknod\0"
522 "mknodat\0"
d361ea5f 523 "newfstat\0"
1a1b13c9 524 "newfstatat\0"
ceaa6aa7
LP
525 "oldfstat\0"
526 "oldlstat\0"
527 "oldstat\0"
1a1b13c9
LP
528 "open\0"
529 "openat\0"
8270e3d8 530 "openat2\0"
1a1b13c9
LP
531 "readlink\0"
532 "readlinkat\0"
533 "removexattr\0"
fb35d775 534 "removexattrat\0"
1a1b13c9 535 "rename\0"
1a1b13c9 536 "renameat\0"
ceaa6aa7 537 "renameat2\0"
1a1b13c9
LP
538 "rmdir\0"
539 "setxattr\0"
fb35d775 540 "setxattrat\0"
1a1b13c9 541 "stat\0"
ceaa6aa7 542 "stat64\0"
1a1b13c9 543 "statfs\0"
ceaa6aa7 544 "statfs64\0"
626df2fe 545 "statmount\0"
a4135a74 546 "statx\0"
1a1b13c9
LP
547 "symlink\0"
548 "symlinkat\0"
1a1b13c9 549 "truncate\0"
ceaa6aa7 550 "truncate64\0"
1a1b13c9
LP
551 "unlink\0"
552 "unlinkat\0"
ceaa6aa7 553 "utime\0"
1a1b13c9 554 "utimensat\0"
6ca67710 555 "utimensat_time64\0"
1a1b13c9
LP
556 "utimes\0"
557 },
8130926d 558 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 559 .name = "@io-event",
d5efc18b 560 .help = "Event loop system calls",
201c1cc2
TM
561 .value =
562 "_newselect\0"
201c1cc2 563 "epoll_create\0"
215728ff 564 "epoll_create1\0"
201c1cc2
TM
565 "epoll_ctl\0"
566 "epoll_ctl_old\0"
567 "epoll_pwait\0"
34254e59 568 "epoll_pwait2\0"
201c1cc2
TM
569 "epoll_wait\0"
570 "epoll_wait_old\0"
201c1cc2 571 "eventfd\0"
215728ff 572 "eventfd2\0"
201c1cc2
TM
573 "poll\0"
574 "ppoll\0"
6ca67710 575 "ppoll_time64\0"
201c1cc2 576 "pselect6\0"
6ca67710 577 "pselect6_time64\0"
201c1cc2 578 "select\0"
8130926d
LP
579 },
580 [SYSCALL_FILTER_SET_IPC] = {
8130926d 581 .name = "@ipc",
d5efc18b
ZJS
582 .help = "SysV IPC, POSIX Message Queues or other IPC",
583 .value =
584 "ipc\0"
cd5bfd7e 585 "memfd_create\0"
201c1cc2
TM
586 "mq_getsetattr\0"
587 "mq_notify\0"
588 "mq_open\0"
589 "mq_timedreceive\0"
6ca67710 590 "mq_timedreceive_time64\0"
201c1cc2 591 "mq_timedsend\0"
6ca67710 592 "mq_timedsend_time64\0"
201c1cc2
TM
593 "mq_unlink\0"
594 "msgctl\0"
595 "msgget\0"
596 "msgrcv\0"
597 "msgsnd\0"
cd5bfd7e 598 "pipe\0"
215728ff 599 "pipe2\0"
34254e59 600 "process_madvise\0"
201c1cc2
TM
601 "process_vm_readv\0"
602 "process_vm_writev\0"
603 "semctl\0"
604 "semget\0"
605 "semop\0"
606 "semtimedop\0"
6ca67710 607 "semtimedop_time64\0"
201c1cc2
TM
608 "shmat\0"
609 "shmctl\0"
610 "shmdt\0"
611 "shmget\0"
8130926d
LP
612 },
613 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 614 .name = "@keyring",
d5efc18b 615 .help = "Kernel keyring access",
1f9ac68b
LP
616 .value =
617 "add_key\0"
618 "keyctl\0"
619 "request_key\0"
8130926d 620 },
cd0ddf6f
LP
621 [SYSCALL_FILTER_SET_MEMLOCK] = {
622 .name = "@memlock",
623 .help = "Memory locking control",
624 .value =
625 "mlock\0"
626 "mlock2\0"
627 "mlockall\0"
628 "munlock\0"
629 "munlockall\0"
630 },
8130926d 631 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 632 .name = "@module",
d5efc18b 633 .help = "Loading and unloading of kernel modules",
201c1cc2 634 .value =
201c1cc2
TM
635 "delete_module\0"
636 "finit_module\0"
637 "init_module\0"
8130926d
LP
638 },
639 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 640 .name = "@mount",
d5efc18b 641 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
642 .value =
643 "chroot\0"
9e486265
LP
644 "fsconfig\0"
645 "fsmount\0"
646 "fsopen\0"
647 "fspick\0"
201c1cc2 648 "mount\0"
34254e59 649 "mount_setattr\0"
9e486265
LP
650 "move_mount\0"
651 "open_tree\0"
38704f5d 652 "open_tree_attr\0"
201c1cc2 653 "pivot_root\0"
201c1cc2 654 "umount\0"
215728ff 655 "umount2\0"
8130926d
LP
656 },
657 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 658 .name = "@network-io",
d5efc18b 659 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 660 .value =
201c1cc2 661 "accept\0"
215728ff 662 "accept4\0"
201c1cc2
TM
663 "bind\0"
664 "connect\0"
665 "getpeername\0"
666 "getsockname\0"
667 "getsockopt\0"
668 "listen\0"
669 "recv\0"
670 "recvfrom\0"
671 "recvmmsg\0"
6ca67710 672 "recvmmsg_time64\0"
201c1cc2
TM
673 "recvmsg\0"
674 "send\0"
675 "sendmmsg\0"
676 "sendmsg\0"
677 "sendto\0"
678 "setsockopt\0"
679 "shutdown\0"
680 "socket\0"
681 "socketcall\0"
682 "socketpair\0"
8130926d
LP
683 },
684 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 685 /* some unknown even to libseccomp */
8130926d 686 .name = "@obsolete",
d5efc18b 687 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
688 .value =
689 "_sysctl\0"
690 "afs_syscall\0"
802fa07a 691 "bdflush\0"
201c1cc2 692 "break\0"
1f9ac68b 693 "create_module\0"
201c1cc2
TM
694 "ftime\0"
695 "get_kernel_syms\0"
201c1cc2
TM
696 "getpmsg\0"
697 "gtty\0"
7e0c3b8f 698 "idle\0"
201c1cc2 699 "lock\0"
201c1cc2 700 "mpx\0"
201c1cc2
TM
701 "prof\0"
702 "profil\0"
201c1cc2
TM
703 "putpmsg\0"
704 "query_module\0"
201c1cc2
TM
705 "security\0"
706 "sgetmask\0"
707 "ssetmask\0"
ae5e9bf4 708 "stime\0"
201c1cc2 709 "stty\0"
1f9ac68b 710 "sysfs\0"
201c1cc2
TM
711 "tuxcall\0"
712 "ulimit\0"
713 "uselib\0"
1f9ac68b 714 "ustat\0"
201c1cc2 715 "vserver\0"
8130926d 716 },
9493b168
ZJS
717 [SYSCALL_FILTER_SET_PKEY] = {
718 .name = "@pkey",
719 .help = "System calls used for memory protection keys",
720 .value =
721 "pkey_alloc\0"
722 "pkey_free\0"
723 "pkey_mprotect\0"
724 },
8130926d 725 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 726 .name = "@privileged",
d5efc18b 727 .help = "All system calls which need super-user capabilities",
201c1cc2 728 .value =
44898c53 729 "@chown\0"
201c1cc2
TM
730 "@clock\0"
731 "@module\0"
732 "@raw-io\0"
af0f047b
LP
733 "@reboot\0"
734 "@swap\0"
215728ff 735 "_sysctl\0"
201c1cc2 736 "acct\0"
201c1cc2 737 "bpf\0"
1f9ac68b 738 "capset\0"
201c1cc2 739 "chroot\0"
a05cfe23 740 "fanotify_init\0"
9e486265 741 "fanotify_mark\0"
201c1cc2 742 "nfsservctl\0"
a05cfe23 743 "open_by_handle_at\0"
201c1cc2
TM
744 "pivot_root\0"
745 "quotactl\0"
76e86b8d 746 "quotactl_fd\0"
201c1cc2 747 "setdomainname\0"
201c1cc2 748 "setfsuid\0"
215728ff 749 "setfsuid32\0"
201c1cc2 750 "setgroups\0"
215728ff 751 "setgroups32\0"
201c1cc2 752 "sethostname\0"
201c1cc2 753 "setresuid\0"
215728ff 754 "setresuid32\0"
201c1cc2 755 "setreuid\0"
215728ff 756 "setreuid32\0"
e05ee49b 757 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 758 "setuid32\0"
201c1cc2 759 "vhangup\0"
8130926d
LP
760 },
761 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 762 .name = "@process",
7b121df6 763 .help = "Process control, execution, namespacing operations",
201c1cc2 764 .value =
09d3020b 765 "capget\0" /* Able to query arbitrary processes */
201c1cc2 766 "clone\0"
c5503601
ZJS
767 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
768 * implement seccomp, so we don't need to list it at all. C.f.
769 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
9e486265 770 "clone3\0"
201c1cc2
TM
771 "execveat\0"
772 "fork\0"
b887d2eb 773 "getrusage\0"
201c1cc2 774 "kill\0"
9e486265 775 "pidfd_open\0"
46fcf95d 776 "pidfd_send_signal\0"
201c1cc2 777 "prctl\0"
b887d2eb
LP
778 "rt_sigqueueinfo\0"
779 "rt_tgsigqueueinfo\0"
201c1cc2 780 "setns\0"
a9518dc3 781 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 782 "tgkill\0"
b887d2eb 783 "times\0"
201c1cc2
TM
784 "tkill\0"
785 "unshare\0"
786 "vfork\0"
b887d2eb
LP
787 "wait4\0"
788 "waitid\0"
789 "waitpid\0"
8130926d
LP
790 },
791 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 792 .name = "@raw-io",
d5efc18b 793 .help = "Raw I/O port access",
201c1cc2
TM
794 .value =
795 "ioperm\0"
796 "iopl\0"
1f9ac68b 797 "pciconfig_iobase\0"
201c1cc2
TM
798 "pciconfig_read\0"
799 "pciconfig_write\0"
800 "s390_pci_mmio_read\0"
801 "s390_pci_mmio_write\0"
8130926d 802 },
bd2ab3f4
LP
803 [SYSCALL_FILTER_SET_REBOOT] = {
804 .name = "@reboot",
805 .help = "Reboot and reboot preparation/kexec",
806 .value =
bd2ab3f4 807 "kexec_file_load\0"
e59608fa 808 "kexec_load\0"
bd2ab3f4
LP
809 "reboot\0"
810 },
133ddbbe 811 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 812 .name = "@resources",
58a8f68b 813 .help = "Alter resource settings",
133ddbbe 814 .value =
0963c053
LP
815 "ioprio_set\0"
816 "mbind\0"
817 "migrate_pages\0"
818 "move_pages\0"
819 "nice\0"
0963c053
LP
820 "sched_setaffinity\0"
821 "sched_setattr\0"
133ddbbe
LP
822 "sched_setparam\0"
823 "sched_setscheduler\0"
0963c053 824 "set_mempolicy\0"
76e86b8d 825 "set_mempolicy_home_node\0"
133ddbbe
LP
826 "setpriority\0"
827 "setrlimit\0"
133ddbbe 828 },
d12632a8
LP
829 [SYSCALL_FILTER_SET_SANDBOX] = {
830 .name = "@sandbox",
831 .help = "Sandbox functionality",
832 .value =
833 "landlock_add_rule\0"
834 "landlock_create_ruleset\0"
835 "landlock_restrict_self\0"
836 "seccomp\0"
837 },
6eaaeee9
LP
838 [SYSCALL_FILTER_SET_SETUID] = {
839 .name = "@setuid",
840 .help = "Operations for changing user/group credentials",
841 .value =
6eaaeee9 842 "setgid\0"
215728ff 843 "setgid32\0"
6eaaeee9 844 "setgroups\0"
215728ff 845 "setgroups32\0"
6eaaeee9 846 "setregid\0"
215728ff 847 "setregid32\0"
6eaaeee9 848 "setresgid\0"
215728ff 849 "setresgid32\0"
6eaaeee9 850 "setresuid\0"
215728ff 851 "setresuid32\0"
6eaaeee9 852 "setreuid\0"
215728ff 853 "setreuid32\0"
6eaaeee9 854 "setuid\0"
215728ff 855 "setuid32\0"
6eaaeee9 856 },
cd0ddf6f
LP
857 [SYSCALL_FILTER_SET_SIGNAL] = {
858 .name = "@signal",
859 .help = "Process signal handling",
860 .value =
861 "rt_sigaction\0"
862 "rt_sigpending\0"
863 "rt_sigprocmask\0"
864 "rt_sigsuspend\0"
865 "rt_sigtimedwait\0"
6ca67710 866 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
867 "sigaction\0"
868 "sigaltstack\0"
869 "signal\0"
870 "signalfd\0"
871 "signalfd4\0"
872 "sigpending\0"
873 "sigprocmask\0"
874 "sigsuspend\0"
875 },
bd2ab3f4
LP
876 [SYSCALL_FILTER_SET_SWAP] = {
877 .name = "@swap",
878 .help = "Enable/disable swap devices",
879 .value =
880 "swapoff\0"
881 "swapon\0"
882 },
44898c53
LP
883 [SYSCALL_FILTER_SET_SYNC] = {
884 .name = "@sync",
885 .help = "Synchronize files and memory to storage",
886 .value =
144fbbac 887 /* Please also update the list in seccomp_suppress_sync(). */
44898c53
LP
888 "fdatasync\0"
889 "fsync\0"
890 "msync\0"
891 "sync\0"
892 "sync_file_range\0"
a8fb09f5 893 "sync_file_range2\0"
44898c53
LP
894 "syncfs\0"
895 },
70526841
LP
896 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
897 .name = "@system-service",
898 .help = "General system service operations",
899 .value =
900 "@aio\0"
901 "@basic-io\0"
902 "@chown\0"
903 "@default\0"
904 "@file-system\0"
905 "@io-event\0"
906 "@ipc\0"
907 "@keyring\0"
908 "@memlock\0"
909 "@network-io\0"
910 "@process\0"
911 "@resources\0"
912 "@setuid\0"
913 "@signal\0"
914 "@sync\0"
915 "@timer\0"
26b682e8 916 "arm_fadvise64_64\0"
70526841
LP
917 "capget\0"
918 "capset\0"
919 "copy_file_range\0"
920 "fadvise64\0"
921 "fadvise64_64\0"
922 "flock\0"
923 "get_mempolicy\0"
924 "getcpu\0"
925 "getpriority\0"
70526841
LP
926 "ioctl\0"
927 "ioprio_get\0"
928 "kcmp\0"
929 "madvise\0"
70526841
LP
930 "mremap\0"
931 "name_to_handle_at\0"
932 "oldolduname\0"
933 "olduname\0"
934 "personality\0"
935 "readahead\0"
936 "readdir\0"
937 "remap_file_pages\0"
938 "sched_get_priority_max\0"
939 "sched_get_priority_min\0"
70526841
LP
940 "sched_getattr\0"
941 "sched_getparam\0"
942 "sched_getscheduler\0"
943 "sched_rr_get_interval\0"
6ca67710 944 "sched_rr_get_interval_time64\0"
70526841
LP
945 "sched_yield\0"
946 "sendfile\0"
947 "sendfile64\0"
948 "setfsgid\0"
949 "setfsgid32\0"
950 "setfsuid\0"
951 "setfsuid32\0"
952 "setpgid\0"
953 "setsid\0"
954 "splice\0"
955 "sysinfo\0"
956 "tee\0"
957 "umask\0"
958 "uname\0"
959 "userfaultfd\0"
960 "vmsplice\0"
961 },
cd0ddf6f
LP
962 [SYSCALL_FILTER_SET_TIMER] = {
963 .name = "@timer",
964 .help = "Schedule operations by time",
965 .value =
966 "alarm\0"
967 "getitimer\0"
968 "setitimer\0"
969 "timer_create\0"
970 "timer_delete\0"
971 "timer_getoverrun\0"
972 "timer_gettime\0"
6ca67710 973 "timer_gettime64\0"
cd0ddf6f 974 "timer_settime\0"
6ca67710 975 "timer_settime64\0"
cd0ddf6f
LP
976 "timerfd_create\0"
977 "timerfd_gettime\0"
6ca67710 978 "timerfd_gettime64\0"
cd0ddf6f 979 "timerfd_settime\0"
6ca67710 980 "timerfd_settime64\0"
cd0ddf6f
LP
981 "times\0"
982 },
95aac012
ZJS
983 [SYSCALL_FILTER_SET_KNOWN] = {
984 .name = "@known",
985 .help = "All known syscalls declared in the kernel",
986 .value =
6d6a0854 987 "@obsolete\0"
5ea01af2 988#include "syscall-list.inc"
95aac012 989 },
201c1cc2 990};
8130926d
LP
991
992const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
993 if (isempty(name) || name[0] != '@')
994 return NULL;
995
077e8fc0 996 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
997 if (streq(syscall_filter_sets[i].name, name))
998 return syscall_filter_sets + i;
999
1000 return NULL;
1001}
1002
000c0520
ZJS
1003static int add_syscall_filter_set(
1004 scmp_filter_ctx seccomp,
1005 const SyscallFilterSet *set,
1006 uint32_t action,
1007 char **exclude,
1008 bool log_missing,
1009 char ***added);
1010
1011int seccomp_add_syscall_filter_item(
1012 scmp_filter_ctx *seccomp,
1013 const char *name,
1014 uint32_t action,
1015 char **exclude,
1016 bool log_missing,
1017 char ***added) {
69b1b241
LP
1018
1019 assert(seccomp);
1020 assert(name);
1021
960e4569
LP
1022 if (strv_contains(exclude, name))
1023 return 0;
1024
000c0520
ZJS
1025 /* Any syscalls that are handled are added to the *added strv. The pointer
1026 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1027
69b1b241
LP
1028 if (name[0] == '@') {
1029 const SyscallFilterSet *other;
1030
1031 other = syscall_filter_set_find(name);
baaa35ad
ZJS
1032 if (!other)
1033 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
1034 "Filter set %s is not known!",
1035 name);
69b1b241 1036
000c0520 1037 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 1038
69b1b241 1039 } else {
b54f36c6 1040 int id, r;
69b1b241
LP
1041
1042 id = seccomp_syscall_resolve_name(name);
cff7bff8 1043 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
1044 if (log_missing)
1045 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 1046 return 0;
cff7bff8 1047 }
69b1b241
LP
1048
1049 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 1050 if (r < 0) {
69b1b241 1051 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
1052 bool ignore = r == -EDOM;
1053
1054 if (!ignore || log_missing)
1055 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1056 name, id, ignore ? ", ignoring" : "");
1057 if (!ignore)
1058 return r;
b54f36c6 1059 }
69b1b241 1060
000c0520
ZJS
1061 if (added) {
1062 r = strv_extend(added, name);
1063 if (r < 0)
1064 return r;
1065 }
1066
b54f36c6
ZJS
1067 return 0;
1068 }
69b1b241
LP
1069}
1070
000c0520 1071static int add_syscall_filter_set(
469830d1 1072 scmp_filter_ctx seccomp,
469830d1 1073 const SyscallFilterSet *set,
960e4569 1074 uint32_t action,
b54f36c6 1075 char **exclude,
000c0520
ZJS
1076 bool log_missing,
1077 char ***added) {
469830d1 1078
8130926d
LP
1079 int r;
1080
000c0520
ZJS
1081 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1082
8130926d
LP
1083 assert(seccomp);
1084 assert(set);
1085
1086 NULSTR_FOREACH(sys, set->value) {
000c0520 1087 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1088 if (r < 0)
1089 return r;
469830d1
LP
1090 }
1091
1092 return 0;
1093}
1094
03c0730f
YW
1095static uint32_t override_default_action(uint32_t default_action) {
1096 /* When the requested filter is an allow-list, and the default action is something critical, we
1097 * install ENOSYS as the default action, but it will only apply to syscalls which are not in the
1098 * @known set. */
1099
1100 if (default_action == SCMP_ACT_ALLOW)
1101 return default_action;
1102
1103#ifdef SCMP_ACT_LOG
1104 if (default_action == SCMP_ACT_LOG)
1105 return default_action;
1106#endif
1107
1108 return SCMP_ACT_ERRNO(ENOSYS);
1109}
1110
b54f36c6 1111int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
03c0730f 1112 uint32_t arch, default_action_override;
469830d1
LP
1113 int r;
1114
1115 assert(set);
1116
1117 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1118 * each local arch. */
469830d1 1119
78b2ad7d
YW
1120 default_action_override = override_default_action(default_action);
1121
469830d1
LP
1122 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1123 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78b2ad7d 1124 _cleanup_strv_free_ char **added = NULL;
469830d1 1125
30868c1c 1126 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1127
78b2ad7d 1128 r = seccomp_init_for_arch(&seccomp, arch, default_action_override);
8130926d
LP
1129 if (r < 0)
1130 return r;
469830d1 1131
78b2ad7d 1132 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, &added);
7e86bd73
ZJS
1133 if (r < 0)
1134 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1 1135
78b2ad7d
YW
1136 if (default_action != default_action_override)
1137 NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
1138 int id;
1139
1140 id = seccomp_syscall_resolve_name(name);
1141 if (id < 0)
1142 continue;
1143
1144 /* Ignore the syscall if it was already handled above */
1145 if (strv_contains(added, name))
1146 continue;
1147
1148 r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
1149 if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */
1150 return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
1151 name, id);
1152 }
1153
1154#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1155 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1156 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2);
1157 if (r < 0)
1158 log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1159#endif
1160
469830d1 1161 r = seccomp_load(seccomp);
3c098014
ZJS
1162 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1163 return r;
1164 if (r < 0)
1165 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m",
1166 seccomp_arch_to_string(arch));
8130926d
LP
1167 }
1168
1169 return 0;
1170}
a3be2849 1171
1862b310 1172int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
03c0730f 1173 uint32_t arch, default_action_override;
a3be2849
LP
1174 int r;
1175
1862b310
YW
1176 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1177 * of a SyscallFilterSet* table. */
a3be2849 1178
1862b310 1179 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
469830d1 1180 return 0;
a3be2849 1181
03c0730f
YW
1182 default_action_override = override_default_action(default_action);
1183
469830d1
LP
1184 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1185 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1186 void *syscall_id, *val;
a3be2849 1187
30868c1c 1188 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1189
03c0730f 1190 r = seccomp_init_for_arch(&seccomp, arch, default_action_override);
469830d1
LP
1191 if (r < 0)
1192 return r;
a3be2849 1193
1862b310 1194 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
8cfa775f 1195 uint32_t a = action;
b54f36c6
ZJS
1196 int id = PTR_TO_INT(syscall_id) - 1;
1197 int error = PTR_TO_INT(val);
8cfa775f 1198
005bfaf1
TM
1199 if (error == SECCOMP_ERROR_NUMBER_KILL)
1200 a = scmp_act_kill_process();
9df2cdd8
TM
1201#ifdef SCMP_ACT_LOG
1202 else if (action == SCMP_ACT_LOG)
1203 a = SCMP_ACT_LOG;
1204#endif
68acc1af 1205 else if (error >= 0)
b54f36c6 1206 a = SCMP_ACT_ERRNO(error);
8cfa775f 1207
b54f36c6 1208 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1 1209 if (r < 0) {
1862b310
YW
1210 /* If the system call is not known on this architecture, then that's
1211 * fine, let's ignore it */
469830d1 1212 _cleanup_free_ char *n = NULL;
7e86bd73 1213 bool ignore;
469830d1 1214
b54f36c6 1215 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1216 ignore = r == -EDOM;
1217 if (!ignore || log_missing)
1218 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1219 strna(n), id, ignore ? ", ignoring" : "");
1220 if (!ignore)
1221 return r;
469830d1
LP
1222 }
1223 }
1224
03c0730f
YW
1225 if (default_action != default_action_override)
1226 NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
1227 int id;
2331c02d 1228
03c0730f
YW
1229 id = seccomp_syscall_resolve_name(name);
1230 if (id < 0)
1231 continue;
2331c02d 1232
03c0730f
YW
1233 /* Ignore the syscall if it was already handled above */
1234 if (hashmap_contains(filter, INT_TO_PTR(id + 1)))
1235 continue;
2331c02d 1236
03c0730f
YW
1237 r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
1238 if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */
1239 return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
1240 name, id);
1241 }
2331c02d 1242
e6c5386d
ZJS
1243#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1244 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1245 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2);
1246 if (r < 0)
1247 log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1248#endif
1249
469830d1 1250 r = seccomp_load(seccomp);
3c098014
ZJS
1251 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1252 return r;
1253 if (r < 0)
a52765a5 1254 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1862b310 1255 seccomp_arch_to_string(arch));
469830d1
LP
1256 }
1257
1258 return 0;
add00535
LP
1259}
1260
58f6ab44 1261int seccomp_parse_syscall_filter(
898748d8
YW
1262 const char *name,
1263 int errno_num,
1264 Hashmap *filter,
13d92c63 1265 SeccompParseFlags flags,
898748d8
YW
1266 const char *unit,
1267 const char *filename,
1268 unsigned line) {
1269
1270 int r;
1271
1272 assert(name);
1273 assert(filter);
1274
084a46d7
YW
1275 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1276 return -EINVAL;
1277
898748d8
YW
1278 if (name[0] == '@') {
1279 const SyscallFilterSet *set;
898748d8
YW
1280
1281 set = syscall_filter_set_find(name);
1282 if (!set) {
9e29ee40 1283 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1284 return -EINVAL;
13d92c63 1285
9e29ee40 1286 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1287 "Unknown system call group, ignoring: %s", name);
1288 return 0;
898748d8
YW
1289 }
1290
1291 NULSTR_FOREACH(i, set->value) {
3c098014
ZJS
1292 /* Call ourselves again, for the group to parse. Note that we downgrade logging here
1293 * (i.e. take away the SECCOMP_PARSE_LOG flag) since any issues in the group table
1294 * are our own problem, not a problem in user configuration data and we shouldn't
1295 * pretend otherwise by complaining about them. */
58f6ab44 1296 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1297 if (r < 0)
1298 return r;
1299 }
1300 } else {
1301 int id;
1302
1303 id = seccomp_syscall_resolve_name(name);
1304 if (id == __NR_SCMP_ERROR) {
9e29ee40 1305 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1306 return -EINVAL;
13d92c63 1307
9e29ee40 1308 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
309a4212 1309 "System call %s is not known, ignoring.", name);
13d92c63 1310 return 0;
898748d8
YW
1311 }
1312
3c098014
ZJS
1313 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it
1314 * from the list. The entries in allow-list with non-negative error value will be handled
1315 * with SCMP_ACT_ERRNO() instead of the default action. */
68acc1af
YW
1316 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1317 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
898748d8
YW
1318 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1319 if (r < 0)
851ee70a
LW
1320 switch (r) {
1321 case -ENOMEM:
9e29ee40 1322 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
851ee70a 1323 case -EEXIST:
9d7fe7c6
LW
1324 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1325 break;
851ee70a
LW
1326 default:
1327 return r;
1328 }
898748d8
YW
1329 } else
1330 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1331 }
1332
1333 return 0;
1334}
1335
add00535 1336int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1337 uint32_t arch;
add00535
LP
1338 int r;
1339
f1d34068 1340 if (DEBUG_LOGGING) {
add00535
LP
1341 _cleanup_free_ char *s = NULL;
1342
86c2a9f1 1343 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1344 log_debug("Restricting namespace to: %s.", strna(s));
1345 }
1346
1347 /* NOOP? */
d7a0f1f4 1348 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
add00535
LP
1349 return 0;
1350
469830d1
LP
1351 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1352 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1353
30868c1c 1354 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1355
1356 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1357 if (r < 0)
1358 return r;
1359
30193fe8
ZJS
1360 /* We cannot filter on individual flags to clone3(), and we need to disable the
1361 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1362 * users shall fall back to clone(), as if on an older kernel.
1363 *
1364 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1365 * https://github.com/moby/moby/issues/42680. */
1366
1367 r = seccomp_rule_add_exact(
1368 seccomp,
1369 SCMP_ACT_ERRNO(ENOSYS),
1370 SCMP_SYS(clone3),
1371 0);
1372 if (r < 0)
3c098014
ZJS
1373 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m",
1374 seccomp_arch_to_string(arch));
30193fe8 1375
469830d1 1376 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
3c098014
ZJS
1377 /* If every single kind of namespace shall be prohibited, then let's block the whole
1378 * setns() syscall altogether. */
469830d1
LP
1379 r = seccomp_rule_add_exact(
1380 seccomp,
1381 SCMP_ACT_ERRNO(EPERM),
1382 SCMP_SYS(setns),
1383 0);
1384 else
3c098014
ZJS
1385 /* Otherwise, block only the invocations with the appropriate flags in the loop
1386 * below, but also the special invocation with a zero flags argument, right here. */
469830d1
LP
1387 r = seccomp_rule_add_exact(
1388 seccomp,
1389 SCMP_ACT_ERRNO(EPERM),
1390 SCMP_SYS(setns),
1391 1,
1392 SCMP_A1(SCMP_CMP_EQ, 0));
1393 if (r < 0) {
3c098014
ZJS
1394 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1395 seccomp_arch_to_string(arch));
469830d1
LP
1396 continue;
1397 }
1398
241b1577 1399 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
469830d1
LP
1400 unsigned long f;
1401
241b1577 1402 f = namespace_info[i].clone_flag;
d7a0f1f4 1403 if (FLAGS_SET(retain, f)) {
241b1577 1404 log_debug("Permitting %s.", namespace_info[i].proc_name);
469830d1
LP
1405 continue;
1406 }
1407
30868c1c 1408 log_trace("Blocking %s.", namespace_info[i].proc_name);
469830d1
LP
1409
1410 r = seccomp_rule_add_exact(
1411 seccomp,
1412 SCMP_ACT_ERRNO(EPERM),
1413 SCMP_SYS(unshare),
1414 1,
1415 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1416 if (r < 0) {
3c098014
ZJS
1417 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m",
1418 seccomp_arch_to_string(arch));
469830d1
LP
1419 break;
1420 }
1421
511ceb1f
ZJS
1422 /* On s390/s390x the first two parameters to clone are switched */
1423 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1424 r = seccomp_rule_add_exact(
1425 seccomp,
1426 SCMP_ACT_ERRNO(EPERM),
1427 SCMP_SYS(clone),
1428 1,
1429 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1430 else
1431 r = seccomp_rule_add_exact(
1432 seccomp,
1433 SCMP_ACT_ERRNO(EPERM),
1434 SCMP_SYS(clone),
1435 1,
1436 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1 1437 if (r < 0) {
3c098014
ZJS
1438 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m",
1439 seccomp_arch_to_string(arch));
469830d1
LP
1440 break;
1441 }
1442
1443 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1444 r = seccomp_rule_add_exact(
1445 seccomp,
1446 SCMP_ACT_ERRNO(EPERM),
1447 SCMP_SYS(setns),
1448 1,
1449 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1450 if (r < 0) {
3c098014
ZJS
1451 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1452 seccomp_arch_to_string(arch));
469830d1
LP
1453 break;
1454 }
1455 }
1456 }
1457 if (r < 0)
1458 continue;
1459
1460 r = seccomp_load(seccomp);
3c098014
ZJS
1461 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1462 return r;
1463 if (r < 0)
1464 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m",
1465 seccomp_arch_to_string(arch));
469830d1
LP
1466 }
1467
1468 return 0;
1469}
1470
1471int seccomp_protect_sysctl(void) {
1472 uint32_t arch;
1473 int r;
1474
1475 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1476 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1477
30868c1c 1478 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1479
f9252236
AJ
1480 if (IN_SET(arch,
1481 SCMP_ARCH_AARCH64,
f9d3fb6b
XW
1482#ifdef SCMP_ARCH_LOONGARCH64
1483 SCMP_ARCH_LOONGARCH64,
1484#endif
f9252236
AJ
1485#ifdef SCMP_ARCH_RISCV64
1486 SCMP_ARCH_RISCV64,
1487#endif
1488 SCMP_ARCH_X32
1489 ))
2e64e8f4
ZJS
1490 /* No _sysctl syscall */
1491 continue;
1492
469830d1
LP
1493 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1494 if (r < 0)
1495 return r;
1496
1497 r = seccomp_rule_add_exact(
add00535
LP
1498 seccomp,
1499 SCMP_ACT_ERRNO(EPERM),
469830d1 1500 SCMP_SYS(_sysctl),
add00535 1501 0);
469830d1 1502 if (r < 0) {
3c098014
ZJS
1503 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m",
1504 seccomp_arch_to_string(arch));
469830d1
LP
1505 continue;
1506 }
1507
1508 r = seccomp_load(seccomp);
3c098014
ZJS
1509 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1510 return r;
1511 if (r < 0)
1512 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m",
1513 seccomp_arch_to_string(arch));
469830d1
LP
1514 }
1515
1516 return 0;
1517}
1518
620dbdd2
KK
1519int seccomp_protect_syslog(void) {
1520 uint32_t arch;
1521 int r;
1522
1523 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1524 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1525
1526 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1527 if (r < 0)
1528 return r;
1529
1530 r = seccomp_rule_add_exact(
1531 seccomp,
1532 SCMP_ACT_ERRNO(EPERM),
1533 SCMP_SYS(syslog),
1534 0);
1535
1536 if (r < 0) {
1537 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1538 continue;
1539 }
1540
1541 r = seccomp_load(seccomp);
3c098014
ZJS
1542 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1543 return r;
1544 if (r < 0)
1545 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m",
1546 seccomp_arch_to_string(arch));
620dbdd2
KK
1547 }
1548
1549 return 0;
1550}
1551
6b000af4 1552int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1553 uint32_t arch;
1554 int r;
1555
1556 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1557 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1558 bool supported;
469830d1 1559
30868c1c 1560 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1561
9606bc4b
LP
1562 switch (arch) {
1563
1564 case SCMP_ARCH_X86_64:
1565 case SCMP_ARCH_X32:
1566 case SCMP_ARCH_ARM:
1567 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1568#ifdef SCMP_ARCH_LOONGARCH64
1569 case SCMP_ARCH_LOONGARCH64:
1570#endif
f5aeac14
JC
1571 case SCMP_ARCH_MIPSEL64N32:
1572 case SCMP_ARCH_MIPS64N32:
1573 case SCMP_ARCH_MIPSEL64:
1574 case SCMP_ARCH_MIPS64:
f9252236
AJ
1575#ifdef SCMP_ARCH_RISCV64
1576 case SCMP_ARCH_RISCV64:
1577#endif
9606bc4b
LP
1578 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1579 supported = true;
1580 break;
1581
9606bc4b
LP
1582 case SCMP_ARCH_S390:
1583 case SCMP_ARCH_S390X:
da1921a5 1584 case SCMP_ARCH_X86:
f5aeac14
JC
1585 case SCMP_ARCH_MIPSEL:
1586 case SCMP_ARCH_MIPS:
344e6b62
SJ
1587#ifdef SCMP_ARCH_PARISC
1588 case SCMP_ARCH_PARISC:
1589#endif
1590#ifdef SCMP_ARCH_PARISC64
1591 case SCMP_ARCH_PARISC64:
1592#endif
d5923e38
ZJS
1593 case SCMP_ARCH_PPC:
1594 case SCMP_ARCH_PPC64:
1595 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1596 default:
1597 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1598 * don't know */
1599 supported = false;
9606bc4b
LP
1600 }
1601
1602 if (!supported)
1603 continue;
1604
469830d1
LP
1605 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1606 if (r < 0)
1607 return r;
1608
6b000af4 1609 if (allow_list) {
077e8fc0 1610 int first = 0, last = 0;
469830d1
LP
1611 void *afp;
1612
6b000af4
LP
1613 /* If this is an allow list, we first block the address families that are out of
1614 * range and then everything that is not in the set. First, we find the lowest and
1615 * highest address family in the set. */
469830d1 1616
90e74a66 1617 SET_FOREACH(afp, address_families) {
077e8fc0 1618 int af = PTR_TO_INT(afp);
469830d1
LP
1619
1620 if (af <= 0 || af >= af_max())
1621 continue;
1622
1623 if (first == 0 || af < first)
1624 first = af;
1625
1626 if (last == 0 || af > last)
1627 last = af;
1628 }
1629
1630 assert((first == 0) == (last == 0));
1631
1632 if (first == 0) {
1633
1634 /* No entries in the valid range, block everything */
1635 r = seccomp_rule_add_exact(
1636 seccomp,
1637 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1638 SCMP_SYS(socket),
1639 0);
1640 if (r < 0) {
3c098014
ZJS
1641 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1642 seccomp_arch_to_string(arch));
469830d1
LP
1643 continue;
1644 }
1645
1646 } else {
1647
1648 /* Block everything below the first entry */
1649 r = seccomp_rule_add_exact(
1650 seccomp,
1651 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1652 SCMP_SYS(socket),
1653 1,
1654 SCMP_A0(SCMP_CMP_LT, first));
1655 if (r < 0) {
3c098014
ZJS
1656 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1657 seccomp_arch_to_string(arch));
469830d1
LP
1658 continue;
1659 }
1660
1661 /* Block everything above the last entry */
1662 r = seccomp_rule_add_exact(
1663 seccomp,
1664 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1665 SCMP_SYS(socket),
1666 1,
1667 SCMP_A0(SCMP_CMP_GT, last));
1668 if (r < 0) {
3c098014
ZJS
1669 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1670 seccomp_arch_to_string(arch));
469830d1
LP
1671 continue;
1672 }
1673
1674 /* Block everything between the first and last entry */
077e8fc0 1675 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1676
1677 if (set_contains(address_families, INT_TO_PTR(af)))
1678 continue;
1679
1680 r = seccomp_rule_add_exact(
1681 seccomp,
1682 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1683 SCMP_SYS(socket),
1684 1,
1685 SCMP_A0(SCMP_CMP_EQ, af));
1686 if (r < 0)
1687 break;
1688 }
469830d1 1689 if (r < 0) {
3c098014
ZJS
1690 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1691 seccomp_arch_to_string(arch));
469830d1
LP
1692 continue;
1693 }
1694 }
1695
1696 } else {
1697 void *af;
1698
6b000af4
LP
1699 /* If this is a deny list, then generate one rule for each address family that are
1700 * then combined in OR checks. */
469830d1 1701
90e74a66 1702 SET_FOREACH(af, address_families) {
469830d1
LP
1703 r = seccomp_rule_add_exact(
1704 seccomp,
1705 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1706 SCMP_SYS(socket),
1707 1,
1708 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1709 if (r < 0)
1710 break;
1711 }
469830d1 1712 if (r < 0) {
3c098014
ZJS
1713 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1714 seccomp_arch_to_string(arch));
469830d1
LP
1715 continue;
1716 }
1717 }
1718
1719 r = seccomp_load(seccomp);
3c098014
ZJS
1720 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1721 return r;
1722 if (r < 0)
1723 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m",
1724 seccomp_arch_to_string(arch));
469830d1
LP
1725 }
1726
1727 return 0;
1728}
1729
a9002749 1730int seccomp_restrict_realtime_full(int error_code) {
469830d1
LP
1731 static const int permitted_policies[] = {
1732 SCHED_OTHER,
1733 SCHED_BATCH,
1734 SCHED_IDLE,
1735 };
1736
1737 int r, max_policy = 0;
1738 uint32_t arch;
469830d1 1739
a9002749
YW
1740 assert(error_code > 0);
1741
469830d1 1742 /* Determine the highest policy constant we want to allow */
ddb8a639
I
1743 FOREACH_ELEMENT(policy, permitted_policies)
1744 if (*policy > max_policy)
1745 max_policy = *policy;
469830d1
LP
1746
1747 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1748 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1749 int p;
1750
30868c1c 1751 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1
LP
1752
1753 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1754 if (r < 0)
1755 return r;
1756
1757 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1758 * allow list. */
469830d1
LP
1759 for (p = 0; p < max_policy; p++) {
1760 bool good = false;
1761
6b000af4 1762 /* Check if this is in the allow list. */
ddb8a639
I
1763 FOREACH_ELEMENT(policy, permitted_policies)
1764 if (*policy == p) {
469830d1
LP
1765 good = true;
1766 break;
1767 }
1768
1769 if (good)
1770 continue;
1771
1772 /* Deny this policy */
1773 r = seccomp_rule_add_exact(
1774 seccomp,
a9002749 1775 SCMP_ACT_ERRNO(error_code),
469830d1
LP
1776 SCMP_SYS(sched_setscheduler),
1777 1,
1778 SCMP_A1(SCMP_CMP_EQ, p));
1779 if (r < 0) {
3c098014
ZJS
1780 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1781 seccomp_arch_to_string(arch));
469830d1
LP
1782 continue;
1783 }
1784 }
1785
6b000af4
LP
1786 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1787 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1788 r = seccomp_rule_add_exact(
add00535 1789 seccomp,
a9002749 1790 SCMP_ACT_ERRNO(error_code),
469830d1 1791 SCMP_SYS(sched_setscheduler),
add00535 1792 1,
469830d1
LP
1793 SCMP_A1(SCMP_CMP_GT, max_policy));
1794 if (r < 0) {
3c098014
ZJS
1795 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1796 seccomp_arch_to_string(arch));
469830d1
LP
1797 continue;
1798 }
add00535 1799
469830d1 1800 r = seccomp_load(seccomp);
3c098014
ZJS
1801 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1802 return r;
1803 if (r < 0)
1804 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m",
1805 seccomp_arch_to_string(arch));
469830d1
LP
1806 }
1807
1808 return 0;
1809}
1810
6dc66688
ZJS
1811static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1812 uint32_t arch,
1813 int nr,
14cb109d 1814 unsigned arg_cnt,
6dc66688
ZJS
1815 const struct scmp_arg_cmp arg) {
1816 int r;
1817
1818 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1819 if (r < 0) {
1820 _cleanup_free_ char *n = NULL;
1821
1822 n = seccomp_syscall_resolve_num_arch(arch, nr);
1823 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1824 strna(n),
1825 seccomp_arch_to_string(arch));
1826 }
1827
1828 return r;
1829}
1830
2a8d6e63 1831/* For known architectures, check that syscalls are indeed defined or not. */
f9d3fb6b 1832#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
2a8d6e63
ZJS
1833assert_cc(SCMP_SYS(shmget) > 0);
1834assert_cc(SCMP_SYS(shmat) > 0);
1835assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1836#endif
6dc66688 1837
469830d1
LP
1838int seccomp_memory_deny_write_execute(void) {
1839 uint32_t arch;
b069c2a3 1840 unsigned loaded = 0;
469830d1
LP
1841
1842 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1843 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1844 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1845
30868c1c 1846 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
469830d1 1847
8a50cf69
LP
1848 switch (arch) {
1849
bed4668d
CE
1850 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1851 * We ignore that here, which means there's still a way to get writable/executable
344e6b62
SJ
1852 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1853 *
1854 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1855 * on that front (kernel work done in 5.18).
1856 */
bed4668d 1857
8a50cf69 1858 case SCMP_ARCH_X86:
57311925 1859 case SCMP_ARCH_S390:
8a50cf69
LP
1860 filter_syscall = SCMP_SYS(mmap2);
1861 block_syscall = SCMP_SYS(mmap);
bed4668d 1862 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1863 break;
1864
63d00dfb 1865 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1866 case SCMP_ARCH_PPC64:
1867 case SCMP_ARCH_PPC64LE:
bed4668d 1868 case SCMP_ARCH_S390X:
2a8d6e63 1869 filter_syscall = SCMP_SYS(mmap);
bed4668d 1870 /* shmat multiplexed, see above */
8a50cf69
LP
1871 break;
1872
4278d1f5
ZJS
1873 case SCMP_ARCH_ARM:
1874 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1875 shmat_syscall = SCMP_SYS(shmat);
1876 break;
1877
8a50cf69
LP
1878 case SCMP_ARCH_X86_64:
1879 case SCMP_ARCH_X32:
79873bc8 1880 case SCMP_ARCH_AARCH64:
f9d3fb6b
XW
1881#ifdef SCMP_ARCH_LOONGARCH64
1882 case SCMP_ARCH_LOONGARCH64:
1883#endif
f9252236
AJ
1884#ifdef SCMP_ARCH_RISCV64
1885 case SCMP_ARCH_RISCV64:
1886#endif
f9d3fb6b 1887 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
8a50cf69
LP
1888 shmat_syscall = SCMP_SYS(shmat);
1889 break;
1890
1891 /* Please add more definitions here, if you port systemd to other architectures! */
1892
f9d3fb6b 1893#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
8a50cf69
LP
1894#warning "Consider adding the right mmap() syscall definitions here!"
1895#endif
1896 }
1897
1898 /* Can't filter mmap() on this arch, then skip it */
1899 if (filter_syscall == 0)
1900 continue;
1901
469830d1
LP
1902 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1903 if (r < 0)
1904 return r;
1905
6dc66688
ZJS
1906 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1907 1,
1908 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1909 if (r < 0)
1910 continue;
8a50cf69
LP
1911
1912 if (block_syscall != 0) {
6dc66688
ZJS
1913 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1914 if (r < 0)
8a50cf69 1915 continue;
add00535 1916 }
a3be2849 1917
6dc66688
ZJS
1918 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1919 1,
b835eeb4
ZJS
1920 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1921 if (r < 0)
1922 continue;
1923
1924 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1925 1,
6dc66688
ZJS
1926 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1927 if (r < 0)
469830d1 1928 continue;
add00535 1929
67fb5f33 1930 if (shmat_syscall > 0) {
5ef3ed97 1931 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1932 1,
1933 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1934 if (r < 0)
8a50cf69 1935 continue;
469830d1
LP
1936 }
1937
1938 r = seccomp_load(seccomp);
3c098014
ZJS
1939 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1940 return r;
1941 if (r < 0)
b069c2a3
ZJS
1942 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1943 seccomp_arch_to_string(arch));
903659e7 1944 loaded++;
469830d1 1945 }
add00535 1946
903659e7 1947 if (loaded == 0)
b069c2a3 1948 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1949
1950 return loaded;
469830d1
LP
1951}
1952
1953int seccomp_restrict_archs(Set *archs) {
1954 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1 1955 int r;
65976868 1956 bool blocked_new = false;
469830d1
LP
1957
1958 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1959 * list.
1960 *
1961 * There are some qualifications. However the most important use is to stop processes from bypassing
1962 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1963 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1964
2428aaf8
AJ
1965 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1966 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1967 * to run a program with the restrictions applied. */
469830d1
LP
1968 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1969 if (!seccomp)
1970 return -ENOMEM;
1971
65976868
GDF
1972 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1973 uint32_t arch = seccomp_local_archs[i];
2428aaf8 1974
f833df38
BB
1975 /* See above comment, our "native" architecture is never blocked. */
1976 if (arch == seccomp_arch_native())
1977 continue;
1978
65976868
GDF
1979 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1980 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1981 continue;
2428aaf8 1982
65976868 1983 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
2428aaf8 1984
65976868
GDF
1985 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1986 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1987 * The important thing is that you can block the old 32-bit x86 syscalls.
1988 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1989 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1990 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1991
1992 if (block) {
1993 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1994 blocked_new = true;
1995 } else {
1996 r = seccomp_arch_add(seccomp, arch);
1997 if (r < 0 && r != -EEXIST)
1998 return r;
1999 }
add00535
LP
2000 }
2001
65976868
GDF
2002 /* All architectures that will be blocked by the seccomp program were
2003 * already blocked. */
2004 if (!blocked_new)
2005 return 0;
2006
469830d1
LP
2007 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2008 if (r < 0)
2009 return r;
add00535 2010
1c6af69b 2011 r = seccomp_load(seccomp);
3c098014
ZJS
2012 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2013 return r;
2014 if (r < 0)
1c6af69b
LP
2015 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
2016
2017 return 0;
a3be2849 2018}
b16bd535 2019
8621c646 2020int parse_syscall_archs(char **l, Set **archs) {
b16bd535
YW
2021 int r;
2022
2023 assert(l);
8621c646 2024 assert(archs);
b16bd535
YW
2025
2026 STRV_FOREACH(s, l) {
2027 uint32_t a;
2028
2029 r = seccomp_arch_from_string(*s, &a);
2030 if (r < 0)
2031 return -EINVAL;
2032
8621c646 2033 r = set_ensure_put(archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
2034 if (r < 0)
2035 return -ENOMEM;
2036 }
2037
b16bd535
YW
2038 return 0;
2039}
165a31c0 2040
b89a262e
YW
2041int seccomp_filter_set_add_by_name(Hashmap *filter, bool add, const char *name) {
2042 assert(filter);
2043 assert(name);
165a31c0 2044
b89a262e
YW
2045 if (name[0] == '@') {
2046 const SyscallFilterSet *more;
165a31c0 2047
b89a262e
YW
2048 more = syscall_filter_set_find(name);
2049 if (!more)
2050 return -ENXIO;
165a31c0 2051
b89a262e
YW
2052 return seccomp_filter_set_add(filter, add, more);
2053 }
165a31c0 2054
b89a262e
YW
2055 int id = seccomp_syscall_resolve_name(name);
2056 if (id == __NR_SCMP_ERROR) {
2057 log_debug("System call %s is not known, ignoring.", name);
2058 return 0;
2059 }
165a31c0 2060
b89a262e
YW
2061 if (add)
2062 return hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0 2063
b89a262e
YW
2064 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
2065 return 0;
2066}
165a31c0 2067
b89a262e
YW
2068int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
2069 int r;
2070
2071 assert(filter);
2072 assert(set);
2073
2074 NULSTR_FOREACH(i, set->value) {
2075 r = seccomp_filter_set_add_by_name(filter, add, i);
2076 if (r < 0)
2077 return r;
165a31c0
LP
2078 }
2079
2080 return 0;
2081}
78e864e5
TM
2082
2083int seccomp_lock_personality(unsigned long personality) {
72eafe71 2084 uint32_t arch;
78e864e5
TM
2085 int r;
2086
72eafe71
LP
2087 if (personality >= PERSONALITY_INVALID)
2088 return -EINVAL;
78e864e5 2089
72eafe71
LP
2090 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2091 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 2092
72eafe71
LP
2093 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2094 if (r < 0)
2095 return r;
2096
2097 r = seccomp_rule_add_exact(
2098 seccomp,
2099 SCMP_ACT_ERRNO(EPERM),
2100 SCMP_SYS(personality),
2101 1,
2102 SCMP_A0(SCMP_CMP_NE, personality));
448ac526 2103 if (r < 0) {
3c098014
ZJS
2104 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
2105 seccomp_arch_to_string(arch));
448ac526
LP
2106 continue;
2107 }
72eafe71
LP
2108
2109 r = seccomp_load(seccomp);
3c098014
ZJS
2110 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2111 return r;
2112 if (r < 0)
2113 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m",
2114 seccomp_arch_to_string(arch));
72eafe71
LP
2115 }
2116
2117 return 0;
78e864e5 2118}
aecd5ac6
TM
2119
2120int seccomp_protect_hostname(void) {
2121 uint32_t arch;
2122 int r;
2123
2124 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2125 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2126
2127 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2128 if (r < 0)
2129 return r;
2130
2131 r = seccomp_rule_add_exact(
2132 seccomp,
2133 SCMP_ACT_ERRNO(EPERM),
2134 SCMP_SYS(sethostname),
2135 0);
9e6e543c 2136 if (r < 0) {
3c098014
ZJS
2137 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m",
2138 seccomp_arch_to_string(arch));
aecd5ac6 2139 continue;
9e6e543c 2140 }
aecd5ac6
TM
2141
2142 r = seccomp_rule_add_exact(
2143 seccomp,
2144 SCMP_ACT_ERRNO(EPERM),
2145 SCMP_SYS(setdomainname),
2146 0);
9e6e543c 2147 if (r < 0) {
3c098014
ZJS
2148 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m",
2149 seccomp_arch_to_string(arch));
aecd5ac6 2150 continue;
9e6e543c 2151 }
aecd5ac6
TM
2152
2153 r = seccomp_load(seccomp);
3c098014
ZJS
2154 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2155 return r;
2156 if (r < 0)
2157 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m",
2158 seccomp_arch_to_string(arch));
aecd5ac6
TM
2159 }
2160
2161 return 0;
2162}
3c27973b 2163
da4dc9a6
ZJS
2164static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2165 /* Checks the mode_t parameter of the following system calls:
2166 *
8b45281d 2167 * → chmod() + fchmod() + fchmodat() + fchmodat2()
da4dc9a6
ZJS
2168 * → open() + creat() + openat()
2169 * → mkdir() + mkdirat()
2170 * → mknod() + mknodat()
2171 *
2172 * Returns error if *everything* failed, and 0 otherwise.
2173 */
6d95e7d9 2174 int r;
da4dc9a6
ZJS
2175 bool any = false;
2176
2177 r = seccomp_rule_add_exact(
2178 seccomp,
2179 SCMP_ACT_ERRNO(EPERM),
2180 SCMP_SYS(chmod),
2181 1,
2182 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2183 if (r < 0)
2184 log_debug_errno(r, "Failed to add filter for chmod: %m");
2185 else
2186 any = true;
2187
2188 r = seccomp_rule_add_exact(
2189 seccomp,
2190 SCMP_ACT_ERRNO(EPERM),
2191 SCMP_SYS(fchmod),
2192 1,
2193 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2194 if (r < 0)
2195 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2196 else
2197 any = true;
2198
2199 r = seccomp_rule_add_exact(
2200 seccomp,
2201 SCMP_ACT_ERRNO(EPERM),
2202 SCMP_SYS(fchmodat),
2203 1,
2204 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2205 if (r < 0)
2206 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2207 else
2208 any = true;
2209
8b45281d
AM
2210#if defined(__SNR_fchmodat2)
2211 r = seccomp_rule_add_exact(
2212 seccomp,
2213 SCMP_ACT_ERRNO(EPERM),
2214 SCMP_SYS(fchmodat2),
2215 1,
2216 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2217#else
2218 /* It looks like this libseccomp does not know about fchmodat2().
2219 * Pretend the fchmodat2() system call is not supported at all,
2220 * regardless of the kernel version. */
2221 r = seccomp_rule_add_exact(
2222 seccomp,
2223 SCMP_ACT_ERRNO(ENOSYS),
2224 __NR_fchmodat2,
2225 0);
2226#endif
2227 if (r < 0)
2228 log_debug_errno(r, "Failed to add filter for fchmodat2: %m");
2229 else
2230 any = true;
2231
da4dc9a6
ZJS
2232 r = seccomp_rule_add_exact(
2233 seccomp,
2234 SCMP_ACT_ERRNO(EPERM),
2235 SCMP_SYS(mkdir),
2236 1,
2237 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2238 if (r < 0)
2239 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2240 else
2241 any = true;
2242
2243 r = seccomp_rule_add_exact(
2244 seccomp,
2245 SCMP_ACT_ERRNO(EPERM),
2246 SCMP_SYS(mkdirat),
2247 1,
2248 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2249 if (r < 0)
2250 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2251 else
2252 any = true;
2253
2254 r = seccomp_rule_add_exact(
2255 seccomp,
2256 SCMP_ACT_ERRNO(EPERM),
2257 SCMP_SYS(mknod),
2258 1,
2259 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2260 if (r < 0)
2261 log_debug_errno(r, "Failed to add filter for mknod: %m");
2262 else
2263 any = true;
2264
2265 r = seccomp_rule_add_exact(
2266 seccomp,
2267 SCMP_ACT_ERRNO(EPERM),
2268 SCMP_SYS(mknodat),
2269 1,
2270 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2271 if (r < 0)
2272 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2273 else
2274 any = true;
2275
da4dc9a6
ZJS
2276 r = seccomp_rule_add_exact(
2277 seccomp,
2278 SCMP_ACT_ERRNO(EPERM),
2279 SCMP_SYS(open),
2280 2,
2281 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2282 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2283 if (r < 0)
2284 log_debug_errno(r, "Failed to add filter for open: %m");
2285 else
2286 any = true;
da4dc9a6
ZJS
2287
2288 r = seccomp_rule_add_exact(
2289 seccomp,
2290 SCMP_ACT_ERRNO(EPERM),
2291 SCMP_SYS(openat),
2292 2,
2293 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2294 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2295 if (r < 0)
2296 log_debug_errno(r, "Failed to add filter for openat: %m");
2297 else
2298 any = true;
2299
ecc04067
LP
2300#if defined(__SNR_openat2)
2301 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2302 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2303 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
57353d29
MG
2304 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2305 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2306 * to call open() or openat() instead. We can properly enforce policy for those functions. */
ecc04067
LP
2307 r = seccomp_rule_add_exact(
2308 seccomp,
57353d29 2309 SCMP_ACT_ERRNO(ENOSYS),
ecc04067
LP
2310 SCMP_SYS(openat2),
2311 0);
2312 if (r < 0)
2313 log_debug_errno(r, "Failed to add filter for openat2: %m");
2314 else
2315 any = true;
2316#endif
2317
da4dc9a6
ZJS
2318 r = seccomp_rule_add_exact(
2319 seccomp,
2320 SCMP_ACT_ERRNO(EPERM),
2321 SCMP_SYS(creat),
2322 1,
2323 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2324 if (r < 0)
2325 log_debug_errno(r, "Failed to add filter for creat: %m");
2326 else
2327 any = true;
2328
2329 return any ? 0 : r;
2330}
2331
3c27973b
LP
2332int seccomp_restrict_suid_sgid(void) {
2333 uint32_t arch;
da4dc9a6 2334 int r, k;
3c27973b
LP
2335
2336 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2337 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2338
2339 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2340 if (r < 0)
2341 return r;
2342
da4dc9a6
ZJS
2343 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2344 if (r < 0)
3c098014
ZJS
2345 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m",
2346 seccomp_arch_to_string(arch));
3c27973b 2347
da4dc9a6
ZJS
2348 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2349 if (k < 0)
a539314a 2350 log_debug_errno(k, "Failed to add sgid rule for architecture %s, ignoring: %m",
3c098014 2351 seccomp_arch_to_string(arch));
3c27973b 2352
da4dc9a6 2353 if (r < 0 && k < 0)
3c27973b 2354 continue;
3c27973b
LP
2355
2356 r = seccomp_load(seccomp);
3c098014
ZJS
2357 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2358 return r;
2359 if (r < 0)
2360 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m",
2361 seccomp_arch_to_string(arch));
3c27973b
LP
2362 }
2363
2364 return 0;
2365}
915fb324
LP
2366
2367uint32_t scmp_act_kill_process(void) {
2368
2369 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2370 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2371 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2372 * for single-threaded apps does the right thing. */
2373
2374#ifdef SCMP_ACT_KILL_PROCESS
2375 if (seccomp_api_get() >= 3)
2376 return SCMP_ACT_KILL_PROCESS;
2377#endif
2378
2379 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2380}
22eadc28
YW
2381
2382int parse_syscall_and_errno(const char *in, char **name, int *error) {
2383 _cleanup_free_ char *n = NULL;
2384 char *p;
2385 int e = -1;
2386
2387 assert(in);
2388 assert(name);
2389 assert(error);
2390
2391 /*
2392 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2393 * If errno is omitted, then error is set to -1.
2394 * Empty syscall name is not allowed.
2395 * Here, we do not check that the syscall name is valid or not.
2396 */
2397
2398 p = strchr(in, ':');
2399 if (p) {
2400 e = seccomp_parse_errno_or_action(p + 1);
2401 if (e < 0)
2402 return e;
2403
2404 n = strndup(in, p - in);
2405 } else
2406 n = strdup(in);
2407
2408 if (!n)
2409 return -ENOMEM;
2410
2411 if (isempty(n))
2412 return -EINVAL;
2413
2414 *error = e;
2415 *name = TAKE_PTR(n);
2416
2417 return 0;
2418}
4a4654e0
LP
2419
2420static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2421 bool any = false;
2422 int r;
2423
2424 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2425 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2426
4a4654e0
LP
2427 r = seccomp_rule_add_exact(
2428 seccomp,
2429 SCMP_ACT_ERRNO(EINVAL),
2430 SCMP_SYS(open),
2431 1,
2432 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2433 if (r < 0)
2434 log_debug_errno(r, "Failed to add filter for open: %m");
2435 else
2436 any = true;
4a4654e0
LP
2437
2438 r = seccomp_rule_add_exact(
2439 seccomp,
2440 SCMP_ACT_ERRNO(EINVAL),
2441 SCMP_SYS(openat),
2442 1,
2443 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2444 if (r < 0)
2445 log_debug_errno(r, "Failed to add filter for openat: %m");
2446 else
2447 any = true;
2448
2449#if defined(__SNR_openat2)
2450 /* The new openat2() system call can't be filtered sensibly, see above. */
2451 r = seccomp_rule_add_exact(
2452 seccomp,
2453 SCMP_ACT_ERRNO(ENOSYS),
2454 SCMP_SYS(openat2),
2455 0);
2456 if (r < 0)
2457 log_debug_errno(r, "Failed to add filter for openat2: %m");
2458 else
2459 any = true;
2460#endif
2461
2462 return any ? 0 : r;
2463}
2464
2465int seccomp_suppress_sync(void) {
2466 uint32_t arch;
2467 int r;
2468
144fbbac
YW
2469 /* This behaves slightly differently from SystemCallFilter=~@sync:0, in that negative fds (which
2470 * we can determine to be invalid) are still refused with EBADF. See #34478.
2471 *
2472 * Additionally, O_SYNC/O_DSYNC are masked. */
4a4654e0
LP
2473
2474 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2475 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
4a4654e0
LP
2476
2477 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2478 if (r < 0)
2479 return r;
2480
2481 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2482 int id;
2483
2484 id = seccomp_syscall_resolve_name(c);
2485 if (id == __NR_SCMP_ERROR) {
2486 log_debug("System call %s is not known, ignoring.", c);
2487 continue;
2488 }
2489
144fbbac
YW
2490 if (STR_IN_SET(c, "fdatasync", "fsync", "sync_file_range", "sync_file_range2", "syncfs"))
2491 r = seccomp_rule_add_exact(
2492 seccomp,
2493 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2494 id,
2495 1,
2496 SCMP_A0(SCMP_CMP_LE, INT_MAX)); /* The rule handles arguments in unsigned. Hence, this
2497 * means non-negative fd matches the rule, and the negative
2498 * fd passed to the syscall (then it fails with EBADF). */
2499 else
2500 r = seccomp_rule_add_exact(
2501 seccomp,
2502 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2503 id,
2504 0);
4a4654e0
LP
2505 if (r < 0)
2506 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2507 }
2508
2509 (void) block_open_flag(seccomp, O_SYNC);
2510#if O_DSYNC != O_SYNC
2511 (void) block_open_flag(seccomp, O_DSYNC);
2512#endif
2513
2514 r = seccomp_load(seccomp);
3c098014
ZJS
2515 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2516 return r;
2517 if (r < 0)
2518 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m",
2519 seccomp_arch_to_string(arch));
4a4654e0
LP
2520 }
2521
2522 return 0;
2523}
69a283c5
DDM
2524
2525#endif
2526
2527bool seccomp_errno_or_action_is_valid(int n) {
2528 return n == SECCOMP_ERROR_NUMBER_KILL || errno_is_valid(n);
2529}
2530
2531int seccomp_parse_errno_or_action(const char *p) {
2532 if (streq_ptr(p, "kill"))
2533 return SECCOMP_ERROR_NUMBER_KILL;
2534 return parse_errno(p);
2535}
2536
2537const char* seccomp_errno_or_action_to_string(int num) {
2538 if (num == SECCOMP_ERROR_NUMBER_KILL)
2539 return "kill";
2540 return errno_to_name(num);
2541}