]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
strv: make iterator in STRV_FOREACH() declaread in the loop
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
a8fbdf54 6#include <stddef.h>
469830d1 7#include <sys/mman.h>
d347d902 8#include <sys/prctl.h>
469830d1 9#include <sys/shm.h>
3c27973b 10#include <sys/stat.h>
57183d11 11
e83156c2
YW
12/* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13#include "missing_syscall_def.h"
14#include <seccomp.h>
15
469830d1 16#include "af-list.h"
add00535 17#include "alloc-util.h"
44aaddad 18#include "env-util.h"
d8b4d14d 19#include "errno-list.h"
a8fbdf54 20#include "macro.h"
add00535 21#include "nsflags.h"
d8b4d14d 22#include "nulstr-util.h"
78e864e5 23#include "process-util.h"
cf0fbc49 24#include "seccomp-util.h"
b16bd535 25#include "set.h"
07630cea 26#include "string-util.h"
b16bd535 27#include "strv.h"
469830d1 28
65976868
GDF
29/* This array will be modified at runtime as seccomp_restrict_archs is called. */
30uint32_t seccomp_local_archs[] = {
469830d1 31
6b000af4 32 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
33
34#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
35 SCMP_ARCH_X86,
36 SCMP_ARCH_X86_64,
f2d9751c
LP
37 SCMP_ARCH_X32, /* native */
38#elif defined(__x86_64__) && !defined(__ILP32__)
39 SCMP_ARCH_X86,
469830d1 40 SCMP_ARCH_X32,
f2d9751c
LP
41 SCMP_ARCH_X86_64, /* native */
42#elif defined(__i386__)
43 SCMP_ARCH_X86,
44#elif defined(__aarch64__)
469830d1 45 SCMP_ARCH_ARM,
f2d9751c
LP
46 SCMP_ARCH_AARCH64, /* native */
47#elif defined(__arm__)
48 SCMP_ARCH_ARM,
49#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
50 SCMP_ARCH_MIPSEL,
51 SCMP_ARCH_MIPS, /* native */
52#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 53 SCMP_ARCH_MIPS,
f2d9751c
LP
54 SCMP_ARCH_MIPSEL, /* native */
55#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPSEL,
57 SCMP_ARCH_MIPS,
58 SCMP_ARCH_MIPSEL64N32,
469830d1 59 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
60 SCMP_ARCH_MIPSEL64,
61 SCMP_ARCH_MIPS64, /* native */
62#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
63 SCMP_ARCH_MIPS,
469830d1 64 SCMP_ARCH_MIPSEL,
f2d9751c
LP
65 SCMP_ARCH_MIPS64N32,
66 SCMP_ARCH_MIPSEL64N32,
67 SCMP_ARCH_MIPS64,
68 SCMP_ARCH_MIPSEL64, /* native */
69#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPSEL,
71 SCMP_ARCH_MIPS,
469830d1 72 SCMP_ARCH_MIPSEL64,
f2d9751c 73 SCMP_ARCH_MIPS64,
469830d1 74 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
75 SCMP_ARCH_MIPS64N32, /* native */
76#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
77 SCMP_ARCH_MIPS,
78 SCMP_ARCH_MIPSEL,
79 SCMP_ARCH_MIPS64,
80 SCMP_ARCH_MIPSEL64,
81 SCMP_ARCH_MIPS64N32,
82 SCMP_ARCH_MIPSEL64N32, /* native */
83#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 84 SCMP_ARCH_PPC,
469830d1 85 SCMP_ARCH_PPC64LE,
f2d9751c
LP
86 SCMP_ARCH_PPC64, /* native */
87#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
88 SCMP_ARCH_PPC,
89 SCMP_ARCH_PPC64,
90 SCMP_ARCH_PPC64LE, /* native */
91#elif defined(__powerpc__)
92 SCMP_ARCH_PPC,
f9252236
AJ
93#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
94 SCMP_ARCH_RISCV64,
f2d9751c
LP
95#elif defined(__s390x__)
96 SCMP_ARCH_S390,
97 SCMP_ARCH_S390X, /* native */
98#elif defined(__s390__)
469830d1 99 SCMP_ARCH_S390,
469830d1 100#endif
65976868 101 SECCOMP_LOCAL_ARCH_END
469830d1 102 };
57183d11
LP
103
104const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
105 /* Maintain order used in <seccomp.h>.
106 *
107 * Names used here should be the same as those used for ConditionArchitecture=,
108 * except for "subarchitectures" like x32. */
57183d11 109
aa34055f
ZJS
110 switch(c) {
111 case SCMP_ARCH_NATIVE:
57183d11 112 return "native";
aa34055f 113 case SCMP_ARCH_X86:
57183d11 114 return "x86";
aa34055f 115 case SCMP_ARCH_X86_64:
57183d11 116 return "x86-64";
aa34055f 117 case SCMP_ARCH_X32:
57183d11 118 return "x32";
aa34055f 119 case SCMP_ARCH_ARM:
57183d11 120 return "arm";
aa34055f
ZJS
121 case SCMP_ARCH_AARCH64:
122 return "arm64";
123 case SCMP_ARCH_MIPS:
124 return "mips";
125 case SCMP_ARCH_MIPS64:
126 return "mips64";
127 case SCMP_ARCH_MIPS64N32:
128 return "mips64-n32";
129 case SCMP_ARCH_MIPSEL:
130 return "mips-le";
131 case SCMP_ARCH_MIPSEL64:
132 return "mips64-le";
133 case SCMP_ARCH_MIPSEL64N32:
134 return "mips64-le-n32";
135 case SCMP_ARCH_PPC:
136 return "ppc";
137 case SCMP_ARCH_PPC64:
138 return "ppc64";
139 case SCMP_ARCH_PPC64LE:
140 return "ppc64-le";
f9252236
AJ
141#ifdef SCMP_ARCH_RISCV64
142 case SCMP_ARCH_RISCV64:
143 return "riscv64";
144#endif
aa34055f 145 case SCMP_ARCH_S390:
6abfd303 146 return "s390";
aa34055f 147 case SCMP_ARCH_S390X:
6abfd303 148 return "s390x";
aa34055f
ZJS
149 default:
150 return NULL;
151 }
57183d11
LP
152}
153
154int seccomp_arch_from_string(const char *n, uint32_t *ret) {
155 if (!n)
156 return -EINVAL;
157
158 assert(ret);
159
160 if (streq(n, "native"))
161 *ret = SCMP_ARCH_NATIVE;
162 else if (streq(n, "x86"))
163 *ret = SCMP_ARCH_X86;
164 else if (streq(n, "x86-64"))
165 *ret = SCMP_ARCH_X86_64;
166 else if (streq(n, "x32"))
167 *ret = SCMP_ARCH_X32;
168 else if (streq(n, "arm"))
169 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
170 else if (streq(n, "arm64"))
171 *ret = SCMP_ARCH_AARCH64;
172 else if (streq(n, "mips"))
173 *ret = SCMP_ARCH_MIPS;
174 else if (streq(n, "mips64"))
175 *ret = SCMP_ARCH_MIPS64;
176 else if (streq(n, "mips64-n32"))
177 *ret = SCMP_ARCH_MIPS64N32;
178 else if (streq(n, "mips-le"))
179 *ret = SCMP_ARCH_MIPSEL;
180 else if (streq(n, "mips64-le"))
181 *ret = SCMP_ARCH_MIPSEL64;
182 else if (streq(n, "mips64-le-n32"))
183 *ret = SCMP_ARCH_MIPSEL64N32;
184 else if (streq(n, "ppc"))
185 *ret = SCMP_ARCH_PPC;
186 else if (streq(n, "ppc64"))
187 *ret = SCMP_ARCH_PPC64;
188 else if (streq(n, "ppc64-le"))
189 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
190#ifdef SCMP_ARCH_RISCV64
191 else if (streq(n, "riscv64"))
192 *ret = SCMP_ARCH_RISCV64;
193#endif
6abfd303
HB
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
57183d11
LP
198 else
199 return -EINVAL;
200
201 return 0;
202}
e9642be2 203
469830d1 204int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 205 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
206 int r;
207
469830d1
LP
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
469830d1
LP
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
1b52793d 218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 219 if (r < 0)
b4eaa6cc 220 return r;
469830d1 221
1b52793d 222 r = seccomp_arch_add(seccomp, arch);
469830d1 223 if (r < 0)
b4eaa6cc 224 return r;
469830d1
LP
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 235 if (r < 0)
b4eaa6cc 236 return r;
8d7b0c8f
LP
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
b4eaa6cc 240 return r;
8d7b0c8f 241
44aaddad
SD
242#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
243 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
244 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
245 if (r < 0)
246 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
247 }
248#endif
249
b4eaa6cc 250 *ret = TAKE_PTR(seccomp);
8d7b0c8f 251 return 0;
8d7b0c8f
LP
252}
253
/* Probes for seccomp support by querying the calling thread's seccomp mode with PR_GET_SECCOMP.
 * Any non-negative answer counts as "available". */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
257
/* Probes for seccomp filter mode by asking prctl() to install a NULL filter program. The probe is
 * expected to fail; only a failure with EFAULT is taken as "filter mode is available".
 * NOTE(review): presumably EFAULT shows the kernel accepted the mode and only then rejected the
 * NULL pointer — confirm against prctl(2). */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
262
bool is_seccomp_available(void) {
        /* Reports whether seccomp filtering can be used. The answer is computed on the first call
         * and cached in a function-local static for all later calls.
         *
         * $SYSTEMD_SECCOMP set to a false value disables seccomp outright. If the variable is
         * unset, true, or unparsable (the latter is logged), the kernel is probed instead. */

        static int cached_enabled = -1;
        int b;

        if (cached_enabled >= 0)
                return cached_enabled;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                cached_enabled = false;
                return cached_enabled;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
283
8130926d 284const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 285 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 286 .name = "@default",
d5efc18b 287 .help = "System calls that are always permitted",
40eb6a80 288 .value =
5f02870a 289 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
5abede32 290 "brk\0"
8e24b1d2 291 "cacheflush\0"
40eb6a80 292 "clock_getres\0"
6ca67710 293 "clock_getres_time64\0"
40eb6a80 294 "clock_gettime\0"
6ca67710 295 "clock_gettime64\0"
40eb6a80 296 "clock_nanosleep\0"
6ca67710 297 "clock_nanosleep_time64\0"
40eb6a80
ZJS
298 "execve\0"
299 "exit\0"
300 "exit_group\0"
e41b0f42 301 "futex\0"
6ca67710 302 "futex_time64\0"
e41b0f42
LP
303 "get_robust_list\0"
304 "get_thread_area\0"
09d3020b
DH
305 "getegid\0"
306 "getegid32\0"
307 "geteuid\0"
308 "geteuid32\0"
309 "getgid\0"
310 "getgid32\0"
311 "getgroups\0"
312 "getgroups32\0"
313 "getpgid\0"
314 "getpgrp\0"
315 "getpid\0"
316 "getppid\0"
14f4b1b5 317 "getrandom\0"
09d3020b
DH
318 "getresgid\0"
319 "getresgid32\0"
320 "getresuid\0"
321 "getresuid32\0"
40eb6a80 322 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
323 "getsid\0"
324 "gettid\0"
40eb6a80 325 "gettimeofday\0"
09d3020b
DH
326 "getuid\0"
327 "getuid32\0"
e41b0f42 328 "membarrier\0"
5abede32
LP
329 "mmap\0"
330 "mmap2\0"
47286254 331 "mprotect\0"
11b9105d 332 "munmap\0"
40eb6a80
ZJS
333 "nanosleep\0"
334 "pause\0"
4c3a9176 335 "prlimit64\0"
e41b0f42 336 "restart_syscall\0"
6fee3be0 337 "rseq\0"
40eb6a80 338 "rt_sigreturn\0"
7df660e4 339 "sched_getaffinity\0"
8f44de08 340 "sched_yield\0"
e41b0f42
LP
341 "set_robust_list\0"
342 "set_thread_area\0"
343 "set_tid_address\0"
ce5faeac 344 "set_tls\0"
40eb6a80
ZJS
345 "sigreturn\0"
346 "time\0"
4c3a9176 347 "ugetrlimit\0"
40eb6a80 348 },
44898c53
LP
349 [SYSCALL_FILTER_SET_AIO] = {
350 .name = "@aio",
351 .help = "Asynchronous IO",
352 .value =
353 "io_cancel\0"
354 "io_destroy\0"
355 "io_getevents\0"
a05cfe23 356 "io_pgetevents\0"
6ca67710 357 "io_pgetevents_time64\0"
44898c53
LP
358 "io_setup\0"
359 "io_submit\0"
9e486265
LP
360 "io_uring_enter\0"
361 "io_uring_register\0"
362 "io_uring_setup\0"
44898c53 363 },
133ddbbe 364 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 365 .name = "@basic-io",
d5efc18b 366 .help = "Basic IO",
133ddbbe 367 .value =
648a0ed0 368 "_llseek\0"
133ddbbe 369 "close\0"
6ea0d25c 370 "close_range\0"
648a0ed0 371 "dup\0"
133ddbbe
LP
372 "dup2\0"
373 "dup3\0"
133ddbbe
LP
374 "lseek\0"
375 "pread64\0"
376 "preadv\0"
44898c53 377 "preadv2\0"
133ddbbe
LP
378 "pwrite64\0"
379 "pwritev\0"
44898c53 380 "pwritev2\0"
133ddbbe
LP
381 "read\0"
382 "readv\0"
383 "write\0"
384 "writev\0"
385 },
44898c53
LP
386 [SYSCALL_FILTER_SET_CHOWN] = {
387 .name = "@chown",
388 .help = "Change ownership of files and directories",
389 .value =
390 "chown\0"
391 "chown32\0"
392 "fchown\0"
393 "fchown32\0"
394 "fchownat\0"
395 "lchown\0"
396 "lchown32\0"
397 },
8130926d 398 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 399 .name = "@clock",
d5efc18b 400 .help = "Change the system time",
201c1cc2
TM
401 .value =
402 "adjtimex\0"
1f9ac68b 403 "clock_adjtime\0"
6ca67710 404 "clock_adjtime64\0"
1f9ac68b 405 "clock_settime\0"
6ca67710 406 "clock_settime64\0"
201c1cc2 407 "settimeofday\0"
8130926d
LP
408 },
409 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 410 .name = "@cpu-emulation",
d5efc18b 411 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
412 .value =
413 "modify_ldt\0"
414 "subpage_prot\0"
415 "switch_endian\0"
416 "vm86\0"
417 "vm86old\0"
8130926d
LP
418 },
419 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 420 .name = "@debug",
d5efc18b 421 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
422 .value =
423 "lookup_dcookie\0"
424 "perf_event_open\0"
8270e3d8 425 "pidfd_getfd\0"
1f9ac68b
LP
426 "ptrace\0"
427 "rtas\0"
6da432fd 428#if defined __s390__ || defined __s390x__
1f9ac68b 429 "s390_runtime_instr\0"
8130926d 430#endif
1f9ac68b 431 "sys_debug_setcontext\0"
8130926d 432 },
1a1b13c9
LP
433 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
434 .name = "@file-system",
435 .help = "File system operations",
436 .value =
437 "access\0"
438 "chdir\0"
439 "chmod\0"
440 "close\0"
441 "creat\0"
442 "faccessat\0"
bcf08acb 443 "faccessat2\0"
1a1b13c9
LP
444 "fallocate\0"
445 "fchdir\0"
446 "fchmod\0"
447 "fchmodat\0"
1a1b13c9 448 "fcntl\0"
ceaa6aa7 449 "fcntl64\0"
1a1b13c9
LP
450 "fgetxattr\0"
451 "flistxattr\0"
ceaa6aa7 452 "fremovexattr\0"
1a1b13c9 453 "fsetxattr\0"
1a1b13c9 454 "fstat\0"
ceaa6aa7 455 "fstat64\0"
1a1b13c9 456 "fstatat64\0"
1a1b13c9 457 "fstatfs\0"
ceaa6aa7 458 "fstatfs64\0"
1a1b13c9 459 "ftruncate\0"
ceaa6aa7 460 "ftruncate64\0"
1a1b13c9
LP
461 "futimesat\0"
462 "getcwd\0"
1a1b13c9 463 "getdents\0"
ceaa6aa7 464 "getdents64\0"
1a1b13c9
LP
465 "getxattr\0"
466 "inotify_add_watch\0"
ceaa6aa7 467 "inotify_init\0"
1a1b13c9
LP
468 "inotify_init1\0"
469 "inotify_rm_watch\0"
470 "lgetxattr\0"
471 "link\0"
472 "linkat\0"
473 "listxattr\0"
474 "llistxattr\0"
475 "lremovexattr\0"
476 "lsetxattr\0"
1a1b13c9 477 "lstat\0"
ceaa6aa7 478 "lstat64\0"
1a1b13c9
LP
479 "mkdir\0"
480 "mkdirat\0"
481 "mknod\0"
482 "mknodat\0"
1a1b13c9 483 "newfstatat\0"
ceaa6aa7
LP
484 "oldfstat\0"
485 "oldlstat\0"
486 "oldstat\0"
1a1b13c9
LP
487 "open\0"
488 "openat\0"
8270e3d8 489 "openat2\0"
1a1b13c9
LP
490 "readlink\0"
491 "readlinkat\0"
492 "removexattr\0"
493 "rename\0"
1a1b13c9 494 "renameat\0"
ceaa6aa7 495 "renameat2\0"
1a1b13c9
LP
496 "rmdir\0"
497 "setxattr\0"
1a1b13c9 498 "stat\0"
ceaa6aa7 499 "stat64\0"
1a1b13c9 500 "statfs\0"
ceaa6aa7 501 "statfs64\0"
a4135a74 502 "statx\0"
1a1b13c9
LP
503 "symlink\0"
504 "symlinkat\0"
1a1b13c9 505 "truncate\0"
ceaa6aa7 506 "truncate64\0"
1a1b13c9
LP
507 "unlink\0"
508 "unlinkat\0"
ceaa6aa7 509 "utime\0"
1a1b13c9 510 "utimensat\0"
6ca67710 511 "utimensat_time64\0"
1a1b13c9
LP
512 "utimes\0"
513 },
8130926d 514 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 515 .name = "@io-event",
d5efc18b 516 .help = "Event loop system calls",
201c1cc2
TM
517 .value =
518 "_newselect\0"
201c1cc2 519 "epoll_create\0"
215728ff 520 "epoll_create1\0"
201c1cc2
TM
521 "epoll_ctl\0"
522 "epoll_ctl_old\0"
523 "epoll_pwait\0"
34254e59 524 "epoll_pwait2\0"
201c1cc2
TM
525 "epoll_wait\0"
526 "epoll_wait_old\0"
201c1cc2 527 "eventfd\0"
215728ff 528 "eventfd2\0"
201c1cc2
TM
529 "poll\0"
530 "ppoll\0"
6ca67710 531 "ppoll_time64\0"
201c1cc2 532 "pselect6\0"
6ca67710 533 "pselect6_time64\0"
201c1cc2 534 "select\0"
8130926d
LP
535 },
536 [SYSCALL_FILTER_SET_IPC] = {
8130926d 537 .name = "@ipc",
d5efc18b
ZJS
538 .help = "SysV IPC, POSIX Message Queues or other IPC",
539 .value =
540 "ipc\0"
cd5bfd7e 541 "memfd_create\0"
201c1cc2
TM
542 "mq_getsetattr\0"
543 "mq_notify\0"
544 "mq_open\0"
545 "mq_timedreceive\0"
6ca67710 546 "mq_timedreceive_time64\0"
201c1cc2 547 "mq_timedsend\0"
6ca67710 548 "mq_timedsend_time64\0"
201c1cc2
TM
549 "mq_unlink\0"
550 "msgctl\0"
551 "msgget\0"
552 "msgrcv\0"
553 "msgsnd\0"
cd5bfd7e 554 "pipe\0"
215728ff 555 "pipe2\0"
34254e59 556 "process_madvise\0"
201c1cc2
TM
557 "process_vm_readv\0"
558 "process_vm_writev\0"
559 "semctl\0"
560 "semget\0"
561 "semop\0"
562 "semtimedop\0"
6ca67710 563 "semtimedop_time64\0"
201c1cc2
TM
564 "shmat\0"
565 "shmctl\0"
566 "shmdt\0"
567 "shmget\0"
8130926d
LP
568 },
569 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 570 .name = "@keyring",
d5efc18b 571 .help = "Kernel keyring access",
1f9ac68b
LP
572 .value =
573 "add_key\0"
574 "keyctl\0"
575 "request_key\0"
8130926d 576 },
cd0ddf6f
LP
577 [SYSCALL_FILTER_SET_MEMLOCK] = {
578 .name = "@memlock",
579 .help = "Memory locking control",
580 .value =
581 "mlock\0"
582 "mlock2\0"
583 "mlockall\0"
584 "munlock\0"
585 "munlockall\0"
586 },
8130926d 587 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 588 .name = "@module",
d5efc18b 589 .help = "Loading and unloading of kernel modules",
201c1cc2 590 .value =
201c1cc2
TM
591 "delete_module\0"
592 "finit_module\0"
593 "init_module\0"
8130926d
LP
594 },
595 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 596 .name = "@mount",
d5efc18b 597 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
598 .value =
599 "chroot\0"
9e486265
LP
600 "fsconfig\0"
601 "fsmount\0"
602 "fsopen\0"
603 "fspick\0"
201c1cc2 604 "mount\0"
34254e59 605 "mount_setattr\0"
9e486265
LP
606 "move_mount\0"
607 "open_tree\0"
201c1cc2 608 "pivot_root\0"
201c1cc2 609 "umount\0"
215728ff 610 "umount2\0"
8130926d
LP
611 },
612 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 613 .name = "@network-io",
d5efc18b 614 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 615 .value =
201c1cc2 616 "accept\0"
215728ff 617 "accept4\0"
201c1cc2
TM
618 "bind\0"
619 "connect\0"
620 "getpeername\0"
621 "getsockname\0"
622 "getsockopt\0"
623 "listen\0"
624 "recv\0"
625 "recvfrom\0"
626 "recvmmsg\0"
6ca67710 627 "recvmmsg_time64\0"
201c1cc2
TM
628 "recvmsg\0"
629 "send\0"
630 "sendmmsg\0"
631 "sendmsg\0"
632 "sendto\0"
633 "setsockopt\0"
634 "shutdown\0"
635 "socket\0"
636 "socketcall\0"
637 "socketpair\0"
8130926d
LP
638 },
639 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 640 /* some unknown even to libseccomp */
8130926d 641 .name = "@obsolete",
d5efc18b 642 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
643 .value =
644 "_sysctl\0"
645 "afs_syscall\0"
802fa07a 646 "bdflush\0"
201c1cc2 647 "break\0"
1f9ac68b 648 "create_module\0"
201c1cc2
TM
649 "ftime\0"
650 "get_kernel_syms\0"
201c1cc2
TM
651 "getpmsg\0"
652 "gtty\0"
7e0c3b8f 653 "idle\0"
201c1cc2 654 "lock\0"
201c1cc2 655 "mpx\0"
201c1cc2
TM
656 "prof\0"
657 "profil\0"
201c1cc2
TM
658 "putpmsg\0"
659 "query_module\0"
201c1cc2
TM
660 "security\0"
661 "sgetmask\0"
662 "ssetmask\0"
ae5e9bf4 663 "stime\0"
201c1cc2 664 "stty\0"
1f9ac68b 665 "sysfs\0"
201c1cc2
TM
666 "tuxcall\0"
667 "ulimit\0"
668 "uselib\0"
1f9ac68b 669 "ustat\0"
201c1cc2 670 "vserver\0"
8130926d 671 },
9493b168
ZJS
672 [SYSCALL_FILTER_SET_PKEY] = {
673 .name = "@pkey",
674 .help = "System calls used for memory protection keys",
675 .value =
676 "pkey_alloc\0"
677 "pkey_free\0"
678 "pkey_mprotect\0"
679 },
8130926d 680 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 681 .name = "@privileged",
d5efc18b 682 .help = "All system calls which need super-user capabilities",
201c1cc2 683 .value =
44898c53 684 "@chown\0"
201c1cc2
TM
685 "@clock\0"
686 "@module\0"
687 "@raw-io\0"
af0f047b
LP
688 "@reboot\0"
689 "@swap\0"
215728ff 690 "_sysctl\0"
201c1cc2 691 "acct\0"
201c1cc2 692 "bpf\0"
1f9ac68b 693 "capset\0"
201c1cc2 694 "chroot\0"
a05cfe23 695 "fanotify_init\0"
9e486265 696 "fanotify_mark\0"
201c1cc2 697 "nfsservctl\0"
a05cfe23 698 "open_by_handle_at\0"
201c1cc2
TM
699 "pivot_root\0"
700 "quotactl\0"
201c1cc2 701 "setdomainname\0"
201c1cc2 702 "setfsuid\0"
215728ff 703 "setfsuid32\0"
201c1cc2 704 "setgroups\0"
215728ff 705 "setgroups32\0"
201c1cc2 706 "sethostname\0"
201c1cc2 707 "setresuid\0"
215728ff 708 "setresuid32\0"
201c1cc2 709 "setreuid\0"
215728ff 710 "setreuid32\0"
e05ee49b 711 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 712 "setuid32\0"
201c1cc2 713 "vhangup\0"
8130926d
LP
714 },
715 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 716 .name = "@process",
7b121df6 717 .help = "Process control, execution, namespacing operations",
201c1cc2 718 .value =
09d3020b 719 "capget\0" /* Able to query arbitrary processes */
201c1cc2 720 "clone\0"
9e486265 721 "clone3\0"
201c1cc2
TM
722 "execveat\0"
723 "fork\0"
b887d2eb 724 "getrusage\0"
201c1cc2 725 "kill\0"
9e486265 726 "pidfd_open\0"
46fcf95d 727 "pidfd_send_signal\0"
201c1cc2 728 "prctl\0"
b887d2eb
LP
729 "rt_sigqueueinfo\0"
730 "rt_tgsigqueueinfo\0"
201c1cc2 731 "setns\0"
a9518dc3 732 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 733 "tgkill\0"
b887d2eb 734 "times\0"
201c1cc2
TM
735 "tkill\0"
736 "unshare\0"
737 "vfork\0"
b887d2eb
LP
738 "wait4\0"
739 "waitid\0"
740 "waitpid\0"
8130926d
LP
741 },
742 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 743 .name = "@raw-io",
d5efc18b 744 .help = "Raw I/O port access",
201c1cc2
TM
745 .value =
746 "ioperm\0"
747 "iopl\0"
1f9ac68b 748 "pciconfig_iobase\0"
201c1cc2
TM
749 "pciconfig_read\0"
750 "pciconfig_write\0"
6da432fd 751#if defined __s390__ || defined __s390x__
201c1cc2
TM
752 "s390_pci_mmio_read\0"
753 "s390_pci_mmio_write\0"
8130926d
LP
754#endif
755 },
bd2ab3f4
LP
756 [SYSCALL_FILTER_SET_REBOOT] = {
757 .name = "@reboot",
758 .help = "Reboot and reboot preparation/kexec",
759 .value =
bd2ab3f4 760 "kexec_file_load\0"
e59608fa 761 "kexec_load\0"
bd2ab3f4
LP
762 "reboot\0"
763 },
133ddbbe 764 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 765 .name = "@resources",
58a8f68b 766 .help = "Alter resource settings",
133ddbbe 767 .value =
0963c053
LP
768 "ioprio_set\0"
769 "mbind\0"
770 "migrate_pages\0"
771 "move_pages\0"
772 "nice\0"
0963c053
LP
773 "sched_setaffinity\0"
774 "sched_setattr\0"
133ddbbe
LP
775 "sched_setparam\0"
776 "sched_setscheduler\0"
0963c053 777 "set_mempolicy\0"
133ddbbe
LP
778 "setpriority\0"
779 "setrlimit\0"
133ddbbe 780 },
6eaaeee9
LP
781 [SYSCALL_FILTER_SET_SETUID] = {
782 .name = "@setuid",
783 .help = "Operations for changing user/group credentials",
784 .value =
6eaaeee9 785 "setgid\0"
215728ff 786 "setgid32\0"
6eaaeee9 787 "setgroups\0"
215728ff 788 "setgroups32\0"
6eaaeee9 789 "setregid\0"
215728ff 790 "setregid32\0"
6eaaeee9 791 "setresgid\0"
215728ff 792 "setresgid32\0"
6eaaeee9 793 "setresuid\0"
215728ff 794 "setresuid32\0"
6eaaeee9 795 "setreuid\0"
215728ff 796 "setreuid32\0"
6eaaeee9 797 "setuid\0"
215728ff 798 "setuid32\0"
6eaaeee9 799 },
cd0ddf6f
LP
800 [SYSCALL_FILTER_SET_SIGNAL] = {
801 .name = "@signal",
802 .help = "Process signal handling",
803 .value =
804 "rt_sigaction\0"
805 "rt_sigpending\0"
806 "rt_sigprocmask\0"
807 "rt_sigsuspend\0"
808 "rt_sigtimedwait\0"
6ca67710 809 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
810 "sigaction\0"
811 "sigaltstack\0"
812 "signal\0"
813 "signalfd\0"
814 "signalfd4\0"
815 "sigpending\0"
816 "sigprocmask\0"
817 "sigsuspend\0"
818 },
bd2ab3f4
LP
819 [SYSCALL_FILTER_SET_SWAP] = {
820 .name = "@swap",
821 .help = "Enable/disable swap devices",
822 .value =
823 "swapoff\0"
824 "swapon\0"
825 },
44898c53
LP
826 [SYSCALL_FILTER_SET_SYNC] = {
827 .name = "@sync",
828 .help = "Synchronize files and memory to storage",
829 .value =
830 "fdatasync\0"
831 "fsync\0"
832 "msync\0"
833 "sync\0"
834 "sync_file_range\0"
a8fb09f5 835 "sync_file_range2\0"
44898c53
LP
836 "syncfs\0"
837 },
70526841
LP
838 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
839 .name = "@system-service",
840 .help = "General system service operations",
841 .value =
842 "@aio\0"
843 "@basic-io\0"
844 "@chown\0"
845 "@default\0"
846 "@file-system\0"
847 "@io-event\0"
848 "@ipc\0"
849 "@keyring\0"
850 "@memlock\0"
851 "@network-io\0"
852 "@process\0"
853 "@resources\0"
854 "@setuid\0"
855 "@signal\0"
856 "@sync\0"
857 "@timer\0"
70526841
LP
858 "capget\0"
859 "capset\0"
860 "copy_file_range\0"
861 "fadvise64\0"
862 "fadvise64_64\0"
863 "flock\0"
864 "get_mempolicy\0"
865 "getcpu\0"
866 "getpriority\0"
70526841
LP
867 "ioctl\0"
868 "ioprio_get\0"
869 "kcmp\0"
870 "madvise\0"
70526841
LP
871 "mremap\0"
872 "name_to_handle_at\0"
873 "oldolduname\0"
874 "olduname\0"
875 "personality\0"
876 "readahead\0"
877 "readdir\0"
878 "remap_file_pages\0"
879 "sched_get_priority_max\0"
880 "sched_get_priority_min\0"
70526841
LP
881 "sched_getattr\0"
882 "sched_getparam\0"
883 "sched_getscheduler\0"
884 "sched_rr_get_interval\0"
6ca67710 885 "sched_rr_get_interval_time64\0"
70526841
LP
886 "sched_yield\0"
887 "sendfile\0"
888 "sendfile64\0"
889 "setfsgid\0"
890 "setfsgid32\0"
891 "setfsuid\0"
892 "setfsuid32\0"
893 "setpgid\0"
894 "setsid\0"
895 "splice\0"
896 "sysinfo\0"
897 "tee\0"
898 "umask\0"
899 "uname\0"
900 "userfaultfd\0"
901 "vmsplice\0"
902 },
cd0ddf6f
LP
903 [SYSCALL_FILTER_SET_TIMER] = {
904 .name = "@timer",
905 .help = "Schedule operations by time",
906 .value =
907 "alarm\0"
908 "getitimer\0"
909 "setitimer\0"
910 "timer_create\0"
911 "timer_delete\0"
912 "timer_getoverrun\0"
913 "timer_gettime\0"
6ca67710 914 "timer_gettime64\0"
cd0ddf6f 915 "timer_settime\0"
6ca67710 916 "timer_settime64\0"
cd0ddf6f
LP
917 "timerfd_create\0"
918 "timerfd_gettime\0"
6ca67710 919 "timerfd_gettime64\0"
cd0ddf6f 920 "timerfd_settime\0"
6ca67710 921 "timerfd_settime64\0"
cd0ddf6f
LP
922 "times\0"
923 },
95aac012
ZJS
924 [SYSCALL_FILTER_SET_KNOWN] = {
925 .name = "@known",
926 .help = "All known syscalls declared in the kernel",
927 .value =
928#include "syscall-list.h"
929 },
201c1cc2 930};
8130926d
LP
931
932const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
933 if (isempty(name) || name[0] != '@')
934 return NULL;
935
077e8fc0 936 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
937 if (streq(syscall_filter_sets[i].name, name))
938 return syscall_filter_sets + i;
939
940 return NULL;
941}
942
000c0520
ZJS
943static int add_syscall_filter_set(
944 scmp_filter_ctx seccomp,
945 const SyscallFilterSet *set,
946 uint32_t action,
947 char **exclude,
948 bool log_missing,
949 char ***added);
950
951int seccomp_add_syscall_filter_item(
952 scmp_filter_ctx *seccomp,
953 const char *name,
954 uint32_t action,
955 char **exclude,
956 bool log_missing,
957 char ***added) {
69b1b241
LP
958
959 assert(seccomp);
960 assert(name);
961
960e4569
LP
962 if (strv_contains(exclude, name))
963 return 0;
964
000c0520
ZJS
965 /* Any syscalls that are handled are added to the *added strv. The pointer
966 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
967
69b1b241
LP
968 if (name[0] == '@') {
969 const SyscallFilterSet *other;
970
971 other = syscall_filter_set_find(name);
baaa35ad
ZJS
972 if (!other)
973 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
974 "Filter set %s is not known!",
975 name);
69b1b241 976
000c0520 977 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 978
69b1b241 979 } else {
b54f36c6 980 int id, r;
69b1b241
LP
981
982 id = seccomp_syscall_resolve_name(name);
cff7bff8 983 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
984 if (log_missing)
985 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 986 return 0;
cff7bff8 987 }
69b1b241
LP
988
989 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 990 if (r < 0) {
69b1b241 991 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
992 bool ignore = r == -EDOM;
993
994 if (!ignore || log_missing)
995 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
996 name, id, ignore ? ", ignoring" : "");
997 if (!ignore)
998 return r;
b54f36c6 999 }
69b1b241 1000
000c0520
ZJS
1001 if (added) {
1002 r = strv_extend(added, name);
1003 if (r < 0)
1004 return r;
1005 }
1006
b54f36c6
ZJS
1007 return 0;
1008 }
69b1b241
LP
1009}
1010
000c0520 1011static int add_syscall_filter_set(
469830d1 1012 scmp_filter_ctx seccomp,
469830d1 1013 const SyscallFilterSet *set,
960e4569 1014 uint32_t action,
b54f36c6 1015 char **exclude,
000c0520
ZJS
1016 bool log_missing,
1017 char ***added) {
469830d1 1018
8130926d
LP
1019 const char *sys;
1020 int r;
1021
000c0520
ZJS
1022 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1023
8130926d
LP
1024 assert(seccomp);
1025 assert(set);
1026
1027 NULSTR_FOREACH(sys, set->value) {
000c0520 1028 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1029 if (r < 0)
1030 return r;
469830d1
LP
1031 }
1032
1033 return 0;
1034}
1035
b54f36c6 1036int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
1037 uint32_t arch;
1038 int r;
1039
1040 assert(set);
1041
1042 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1043 * each local arch. */
469830d1
LP
1044
1045 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1046 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1047
1048 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1049
1050 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
1051 if (r < 0)
1052 return r;
469830d1 1053
000c0520 1054 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
7e86bd73
ZJS
1055 if (r < 0)
1056 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
1057
1058 r = seccomp_load(seccomp);
7bc5e0b1 1059 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1060 return r;
1061 if (r < 0)
1062 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
1063 }
1064
1065 return 0;
1066}
a3be2849 1067
1862b310 1068int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
469830d1 1069 uint32_t arch;
a3be2849
LP
1070 int r;
1071
1862b310
YW
1072 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1073 * of a SyscallFilterSet* table. */
a3be2849 1074
1862b310 1075 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
469830d1 1076 return 0;
a3be2849 1077
469830d1
LP
1078 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1079 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1080 void *syscall_id, *val;
a3be2849 1081
469830d1 1082 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1083
469830d1
LP
1084 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1085 if (r < 0)
1086 return r;
a3be2849 1087
1862b310 1088 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
8cfa775f 1089 uint32_t a = action;
b54f36c6
ZJS
1090 int id = PTR_TO_INT(syscall_id) - 1;
1091 int error = PTR_TO_INT(val);
8cfa775f 1092
005bfaf1
TM
1093 if (error == SECCOMP_ERROR_NUMBER_KILL)
1094 a = scmp_act_kill_process();
9df2cdd8
TM
1095#ifdef SCMP_ACT_LOG
1096 else if (action == SCMP_ACT_LOG)
1097 a = SCMP_ACT_LOG;
1098#endif
68acc1af 1099 else if (error >= 0)
b54f36c6 1100 a = SCMP_ACT_ERRNO(error);
8cfa775f 1101
b54f36c6 1102 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1 1103 if (r < 0) {
1862b310
YW
1104 /* If the system call is not known on this architecture, then that's
1105 * fine, let's ignore it */
469830d1 1106 _cleanup_free_ char *n = NULL;
7e86bd73 1107 bool ignore;
469830d1 1108
b54f36c6 1109 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1110 ignore = r == -EDOM;
1111 if (!ignore || log_missing)
1112 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1113 strna(n), id, ignore ? ", ignoring" : "");
1114 if (!ignore)
1115 return r;
469830d1
LP
1116 }
1117 }
1118
1119 r = seccomp_load(seccomp);
7bc5e0b1 1120 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1121 return r;
1122 if (r < 0)
1862b310
YW
1123 log_debug_errno(r, "Failed to install systemc call filter for architecture %s, skipping: %m",
1124 seccomp_arch_to_string(arch));
469830d1
LP
1125 }
1126
1127 return 0;
add00535
LP
1128}
1129
58f6ab44 1130int seccomp_parse_syscall_filter(
898748d8
YW
1131 const char *name,
1132 int errno_num,
1133 Hashmap *filter,
13d92c63 1134 SeccompParseFlags flags,
898748d8
YW
1135 const char *unit,
1136 const char *filename,
1137 unsigned line) {
1138
1139 int r;
1140
1141 assert(name);
1142 assert(filter);
1143
084a46d7
YW
1144 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1145 return -EINVAL;
1146
898748d8
YW
1147 if (name[0] == '@') {
1148 const SyscallFilterSet *set;
1149 const char *i;
1150
1151 set = syscall_filter_set_find(name);
1152 if (!set) {
9e29ee40 1153 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1154 return -EINVAL;
13d92c63 1155
9e29ee40 1156 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1157 "Unknown system call group, ignoring: %s", name);
1158 return 0;
898748d8
YW
1159 }
1160
1161 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1162 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1163 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1164 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1165 * about them. */
58f6ab44 1166 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1167 if (r < 0)
1168 return r;
1169 }
1170 } else {
1171 int id;
1172
1173 id = seccomp_syscall_resolve_name(name);
1174 if (id == __NR_SCMP_ERROR) {
9e29ee40 1175 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
898748d8 1176 return -EINVAL;
13d92c63 1177
9e29ee40 1178 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
13d92c63
LP
1179 "Failed to parse system call, ignoring: %s", name);
1180 return 0;
898748d8
YW
1181 }
1182
68acc1af
YW
1183 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1184 * it from the list. The entries in allow-list with non-negative error value will be
1185 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1186 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1187 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
898748d8
YW
1188 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1189 if (r < 0)
851ee70a
LW
1190 switch (r) {
1191 case -ENOMEM:
9e29ee40 1192 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
851ee70a 1193 case -EEXIST:
9d7fe7c6
LW
1194 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1195 break;
851ee70a
LW
1196 default:
1197 return r;
1198 }
898748d8
YW
1199 } else
1200 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1201 }
1202
1203 return 0;
1204}
1205
add00535 1206int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1207 uint32_t arch;
add00535
LP
1208 int r;
1209
f1d34068 1210 if (DEBUG_LOGGING) {
add00535
LP
1211 _cleanup_free_ char *s = NULL;
1212
86c2a9f1 1213 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1214 log_debug("Restricting namespace to: %s.", strna(s));
1215 }
1216
1217 /* NOOP? */
d7a0f1f4 1218 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
add00535
LP
1219 return 0;
1220
469830d1
LP
1221 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1222 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1223
469830d1
LP
1224 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1225
1226 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1227 if (r < 0)
1228 return r;
1229
1230 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1231 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1232 * altogether. */
1233 r = seccomp_rule_add_exact(
1234 seccomp,
1235 SCMP_ACT_ERRNO(EPERM),
1236 SCMP_SYS(setns),
1237 0);
1238 else
1239 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1240 * special invocation with a zero flags argument, right here. */
1241 r = seccomp_rule_add_exact(
1242 seccomp,
1243 SCMP_ACT_ERRNO(EPERM),
1244 SCMP_SYS(setns),
1245 1,
1246 SCMP_A1(SCMP_CMP_EQ, 0));
1247 if (r < 0) {
1248 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1249 continue;
1250 }
1251
077e8fc0 1252 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
469830d1
LP
1253 unsigned long f;
1254
1255 f = namespace_flag_map[i].flag;
d7a0f1f4 1256 if (FLAGS_SET(retain, f)) {
469830d1
LP
1257 log_debug("Permitting %s.", namespace_flag_map[i].name);
1258 continue;
1259 }
1260
1261 log_debug("Blocking %s.", namespace_flag_map[i].name);
1262
1263 r = seccomp_rule_add_exact(
1264 seccomp,
1265 SCMP_ACT_ERRNO(EPERM),
1266 SCMP_SYS(unshare),
1267 1,
1268 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1269 if (r < 0) {
1270 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1271 break;
1272 }
1273
511ceb1f
ZJS
1274 /* On s390/s390x the first two parameters to clone are switched */
1275 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1276 r = seccomp_rule_add_exact(
1277 seccomp,
1278 SCMP_ACT_ERRNO(EPERM),
1279 SCMP_SYS(clone),
1280 1,
1281 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1282 else
1283 r = seccomp_rule_add_exact(
1284 seccomp,
1285 SCMP_ACT_ERRNO(EPERM),
1286 SCMP_SYS(clone),
1287 1,
1288 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1289 if (r < 0) {
1290 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1291 break;
1292 }
1293
1294 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1295 r = seccomp_rule_add_exact(
1296 seccomp,
1297 SCMP_ACT_ERRNO(EPERM),
1298 SCMP_SYS(setns),
1299 1,
1300 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1301 if (r < 0) {
1302 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1303 break;
1304 }
1305 }
1306 }
1307 if (r < 0)
1308 continue;
1309
1310 r = seccomp_load(seccomp);
7bc5e0b1 1311 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1312 return r;
1313 if (r < 0)
1314 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1315 }
1316
1317 return 0;
1318}
1319
1320int seccomp_protect_sysctl(void) {
1321 uint32_t arch;
1322 int r;
1323
1324 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1325 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1326
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1328
f9252236
AJ
1329 if (IN_SET(arch,
1330 SCMP_ARCH_AARCH64,
1331#ifdef SCMP_ARCH_RISCV64
1332 SCMP_ARCH_RISCV64,
1333#endif
1334 SCMP_ARCH_X32
1335 ))
2e64e8f4
ZJS
1336 /* No _sysctl syscall */
1337 continue;
1338
469830d1
LP
1339 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1340 if (r < 0)
1341 return r;
1342
1343 r = seccomp_rule_add_exact(
add00535
LP
1344 seccomp,
1345 SCMP_ACT_ERRNO(EPERM),
469830d1 1346 SCMP_SYS(_sysctl),
add00535 1347 0);
469830d1
LP
1348 if (r < 0) {
1349 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1350 continue;
1351 }
1352
1353 r = seccomp_load(seccomp);
7bc5e0b1 1354 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1355 return r;
1356 if (r < 0)
1357 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1358 }
1359
1360 return 0;
1361}
1362
620dbdd2
KK
1363int seccomp_protect_syslog(void) {
1364 uint32_t arch;
1365 int r;
1366
1367 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1368 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1369
1370 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1371 if (r < 0)
1372 return r;
1373
1374 r = seccomp_rule_add_exact(
1375 seccomp,
1376 SCMP_ACT_ERRNO(EPERM),
1377 SCMP_SYS(syslog),
1378 0);
1379
1380 if (r < 0) {
1381 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1382 continue;
1383 }
1384
1385 r = seccomp_load(seccomp);
1386 if (ERRNO_IS_SECCOMP_FATAL(r))
1387 return r;
1388 if (r < 0)
1389 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1390 }
1391
1392 return 0;
1393}
1394
6b000af4 1395int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1396 uint32_t arch;
1397 int r;
1398
1399 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1400 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1401 bool supported;
469830d1
LP
1402
1403 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1404
9606bc4b
LP
1405 switch (arch) {
1406
1407 case SCMP_ARCH_X86_64:
1408 case SCMP_ARCH_X32:
1409 case SCMP_ARCH_ARM:
1410 case SCMP_ARCH_AARCH64:
f5aeac14
JC
1411 case SCMP_ARCH_MIPSEL64N32:
1412 case SCMP_ARCH_MIPS64N32:
1413 case SCMP_ARCH_MIPSEL64:
1414 case SCMP_ARCH_MIPS64:
f9252236
AJ
1415#ifdef SCMP_ARCH_RISCV64
1416 case SCMP_ARCH_RISCV64:
1417#endif
9606bc4b
LP
1418 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1419 supported = true;
1420 break;
1421
9606bc4b
LP
1422 case SCMP_ARCH_S390:
1423 case SCMP_ARCH_S390X:
da1921a5 1424 case SCMP_ARCH_X86:
f5aeac14
JC
1425 case SCMP_ARCH_MIPSEL:
1426 case SCMP_ARCH_MIPS:
d5923e38
ZJS
1427 case SCMP_ARCH_PPC:
1428 case SCMP_ARCH_PPC64:
1429 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1430 default:
1431 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1432 * don't know */
1433 supported = false;
1434 break;
1435 }
1436
1437 if (!supported)
1438 continue;
1439
469830d1
LP
1440 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1441 if (r < 0)
1442 return r;
1443
6b000af4 1444 if (allow_list) {
077e8fc0 1445 int first = 0, last = 0;
469830d1
LP
1446 void *afp;
1447
6b000af4
LP
1448 /* If this is an allow list, we first block the address families that are out of
1449 * range and then everything that is not in the set. First, we find the lowest and
1450 * highest address family in the set. */
469830d1 1451
90e74a66 1452 SET_FOREACH(afp, address_families) {
077e8fc0 1453 int af = PTR_TO_INT(afp);
469830d1
LP
1454
1455 if (af <= 0 || af >= af_max())
1456 continue;
1457
1458 if (first == 0 || af < first)
1459 first = af;
1460
1461 if (last == 0 || af > last)
1462 last = af;
1463 }
1464
1465 assert((first == 0) == (last == 0));
1466
1467 if (first == 0) {
1468
1469 /* No entries in the valid range, block everything */
1470 r = seccomp_rule_add_exact(
1471 seccomp,
1472 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1473 SCMP_SYS(socket),
1474 0);
1475 if (r < 0) {
1476 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 continue;
1478 }
1479
1480 } else {
1481
1482 /* Block everything below the first entry */
1483 r = seccomp_rule_add_exact(
1484 seccomp,
1485 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1486 SCMP_SYS(socket),
1487 1,
1488 SCMP_A0(SCMP_CMP_LT, first));
1489 if (r < 0) {
1490 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1491 continue;
1492 }
1493
1494 /* Block everything above the last entry */
1495 r = seccomp_rule_add_exact(
1496 seccomp,
1497 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1498 SCMP_SYS(socket),
1499 1,
1500 SCMP_A0(SCMP_CMP_GT, last));
1501 if (r < 0) {
1502 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1503 continue;
1504 }
1505
1506 /* Block everything between the first and last entry */
077e8fc0 1507 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1508
1509 if (set_contains(address_families, INT_TO_PTR(af)))
1510 continue;
1511
1512 r = seccomp_rule_add_exact(
1513 seccomp,
1514 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1515 SCMP_SYS(socket),
1516 1,
1517 SCMP_A0(SCMP_CMP_EQ, af));
1518 if (r < 0)
1519 break;
1520 }
469830d1
LP
1521 if (r < 0) {
1522 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1523 continue;
1524 }
1525 }
1526
1527 } else {
1528 void *af;
1529
6b000af4
LP
1530 /* If this is a deny list, then generate one rule for each address family that are
1531 * then combined in OR checks. */
469830d1 1532
90e74a66 1533 SET_FOREACH(af, address_families) {
469830d1
LP
1534 r = seccomp_rule_add_exact(
1535 seccomp,
1536 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1537 SCMP_SYS(socket),
1538 1,
1539 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1540 if (r < 0)
1541 break;
1542 }
469830d1
LP
1543 if (r < 0) {
1544 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1545 continue;
1546 }
1547 }
1548
1549 r = seccomp_load(seccomp);
7bc5e0b1 1550 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1551 return r;
1552 if (r < 0)
1553 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1554 }
1555
1556 return 0;
1557}
1558
1559int seccomp_restrict_realtime(void) {
1560 static const int permitted_policies[] = {
1561 SCHED_OTHER,
1562 SCHED_BATCH,
1563 SCHED_IDLE,
1564 };
1565
1566 int r, max_policy = 0;
1567 uint32_t arch;
1568 unsigned i;
1569
1570 /* Determine the highest policy constant we want to allow */
1571 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1572 if (permitted_policies[i] > max_policy)
1573 max_policy = permitted_policies[i];
1574
1575 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1576 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1577 int p;
1578
1579 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1580
1581 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1582 if (r < 0)
1583 return r;
1584
1585 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1586 * allow list. */
469830d1
LP
1587 for (p = 0; p < max_policy; p++) {
1588 bool good = false;
1589
6b000af4 1590 /* Check if this is in the allow list. */
469830d1
LP
1591 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1592 if (permitted_policies[i] == p) {
1593 good = true;
1594 break;
1595 }
1596
1597 if (good)
1598 continue;
1599
1600 /* Deny this policy */
1601 r = seccomp_rule_add_exact(
1602 seccomp,
1603 SCMP_ACT_ERRNO(EPERM),
1604 SCMP_SYS(sched_setscheduler),
1605 1,
1606 SCMP_A1(SCMP_CMP_EQ, p));
1607 if (r < 0) {
1608 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1609 continue;
1610 }
1611 }
1612
6b000af4
LP
1613 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1614 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1615 r = seccomp_rule_add_exact(
add00535
LP
1616 seccomp,
1617 SCMP_ACT_ERRNO(EPERM),
469830d1 1618 SCMP_SYS(sched_setscheduler),
add00535 1619 1,
469830d1
LP
1620 SCMP_A1(SCMP_CMP_GT, max_policy));
1621 if (r < 0) {
1622 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1623 continue;
1624 }
add00535 1625
469830d1 1626 r = seccomp_load(seccomp);
7bc5e0b1 1627 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1628 return r;
1629 if (r < 0)
1630 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1631 }
1632
1633 return 0;
1634}
1635
6dc66688
ZJS
1636static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1637 uint32_t arch,
1638 int nr,
14cb109d 1639 unsigned arg_cnt,
6dc66688
ZJS
1640 const struct scmp_arg_cmp arg) {
1641 int r;
1642
1643 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1644 if (r < 0) {
1645 _cleanup_free_ char *n = NULL;
1646
1647 n = seccomp_syscall_resolve_num_arch(arch, nr);
1648 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1649 strna(n),
1650 seccomp_arch_to_string(arch));
1651 }
1652
1653 return r;
1654}
1655
2a8d6e63 1656/* For known architectures, check that syscalls are indeed defined or not. */
f9252236 1657#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
2a8d6e63
ZJS
1658assert_cc(SCMP_SYS(shmget) > 0);
1659assert_cc(SCMP_SYS(shmat) > 0);
1660assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1661#endif
6dc66688 1662
469830d1
LP
1663int seccomp_memory_deny_write_execute(void) {
1664 uint32_t arch;
b069c2a3 1665 unsigned loaded = 0;
469830d1
LP
1666
1667 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1668 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1669 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1670
469830d1
LP
1671 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1672
8a50cf69
LP
1673 switch (arch) {
1674
bed4668d
CE
1675 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1676 * We ignore that here, which means there's still a way to get writable/executable
1677 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1678
8a50cf69 1679 case SCMP_ARCH_X86:
57311925 1680 case SCMP_ARCH_S390:
8a50cf69
LP
1681 filter_syscall = SCMP_SYS(mmap2);
1682 block_syscall = SCMP_SYS(mmap);
bed4668d 1683 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1684 break;
1685
63d00dfb 1686 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1687 case SCMP_ARCH_PPC64:
1688 case SCMP_ARCH_PPC64LE:
bed4668d 1689 case SCMP_ARCH_S390X:
2a8d6e63 1690 filter_syscall = SCMP_SYS(mmap);
bed4668d 1691 /* shmat multiplexed, see above */
8a50cf69
LP
1692 break;
1693
4278d1f5
ZJS
1694 case SCMP_ARCH_ARM:
1695 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1696 shmat_syscall = SCMP_SYS(shmat);
1697 break;
1698
8a50cf69
LP
1699 case SCMP_ARCH_X86_64:
1700 case SCMP_ARCH_X32:
79873bc8 1701 case SCMP_ARCH_AARCH64:
f9252236
AJ
1702#ifdef SCMP_ARCH_RISCV64
1703 case SCMP_ARCH_RISCV64:
1704#endif
1705 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
8a50cf69
LP
1706 shmat_syscall = SCMP_SYS(shmat);
1707 break;
1708
1709 /* Please add more definitions here, if you port systemd to other architectures! */
1710
f9252236 1711#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
8a50cf69
LP
1712#warning "Consider adding the right mmap() syscall definitions here!"
1713#endif
1714 }
1715
1716 /* Can't filter mmap() on this arch, then skip it */
1717 if (filter_syscall == 0)
1718 continue;
1719
469830d1
LP
1720 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1721 if (r < 0)
1722 return r;
1723
6dc66688
ZJS
1724 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1725 1,
1726 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1727 if (r < 0)
1728 continue;
8a50cf69
LP
1729
1730 if (block_syscall != 0) {
6dc66688
ZJS
1731 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1732 if (r < 0)
8a50cf69 1733 continue;
add00535 1734 }
a3be2849 1735
6dc66688
ZJS
1736 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1737 1,
b835eeb4
ZJS
1738 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1739 if (r < 0)
1740 continue;
1741
1742 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1743 1,
6dc66688
ZJS
1744 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1745 if (r < 0)
469830d1 1746 continue;
add00535 1747
67fb5f33 1748 if (shmat_syscall > 0) {
5ef3ed97 1749 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1750 1,
1751 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1752 if (r < 0)
8a50cf69 1753 continue;
469830d1
LP
1754 }
1755
1756 r = seccomp_load(seccomp);
7bc5e0b1 1757 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1758 return r;
add00535 1759 if (r < 0)
b069c2a3
ZJS
1760 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1761 seccomp_arch_to_string(arch));
903659e7 1762 loaded++;
469830d1 1763 }
add00535 1764
903659e7 1765 if (loaded == 0)
b069c2a3 1766 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1767
1768 return loaded;
469830d1
LP
1769}
1770
1771int seccomp_restrict_archs(Set *archs) {
1772 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1 1773 int r;
65976868 1774 bool blocked_new = false;
469830d1
LP
1775
1776 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1777 * list.
1778 *
1779 * There are some qualifications. However the most important use is to stop processes from bypassing
1780 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1781 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1782
2428aaf8
AJ
1783 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1784 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1785 * to run a program with the restrictions applied. */
469830d1
LP
1786 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1787 if (!seccomp)
1788 return -ENOMEM;
1789
65976868
GDF
1790 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1791 uint32_t arch = seccomp_local_archs[i];
2428aaf8 1792
f833df38
BB
1793 /* See above comment, our "native" architecture is never blocked. */
1794 if (arch == seccomp_arch_native())
1795 continue;
1796
65976868
GDF
1797 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1798 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1799 continue;
2428aaf8 1800
65976868 1801 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
2428aaf8 1802
65976868
GDF
1803 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1804 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1805 * The important thing is that you can block the old 32-bit x86 syscalls.
1806 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1807 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1808 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1809
1810 if (block) {
1811 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1812 blocked_new = true;
1813 } else {
1814 r = seccomp_arch_add(seccomp, arch);
1815 if (r < 0 && r != -EEXIST)
1816 return r;
1817 }
add00535
LP
1818 }
1819
65976868
GDF
1820 /* All architectures that will be blocked by the seccomp program were
1821 * already blocked. */
1822 if (!blocked_new)
1823 return 0;
1824
469830d1
LP
1825 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1826 if (r < 0)
1827 return r;
add00535 1828
1c6af69b 1829 r = seccomp_load(seccomp);
7bc5e0b1 1830 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1831 return r;
1832 if (r < 0)
1833 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1834
1835 return 0;
a3be2849 1836}
b16bd535 1837
de7fef4b
ZJS
1838int parse_syscall_archs(char **l, Set **ret_archs) {
1839 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1840 int r;
1841
1842 assert(l);
de7fef4b 1843 assert(ret_archs);
b16bd535
YW
1844
1845 STRV_FOREACH(s, l) {
1846 uint32_t a;
1847
1848 r = seccomp_arch_from_string(*s, &a);
1849 if (r < 0)
1850 return -EINVAL;
1851
de7fef4b 1852 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1853 if (r < 0)
1854 return -ENOMEM;
1855 }
1856
de7fef4b 1857 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1858 return 0;
1859}
165a31c0 1860
8cfa775f 1861int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1862 const char *i;
1863 int r;
1864
1865 assert(set);
1866
1867 NULSTR_FOREACH(i, set->value) {
1868
1869 if (i[0] == '@') {
1870 const SyscallFilterSet *more;
1871
1872 more = syscall_filter_set_find(i);
1873 if (!more)
1874 return -ENXIO;
1875
165a31c0
LP
1876 r = seccomp_filter_set_add(filter, add, more);
1877 if (r < 0)
1878 return r;
1879 } else {
1880 int id;
1881
1882 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1883 if (id == __NR_SCMP_ERROR) {
1884 log_debug("Couldn't resolve system call, ignoring: %s", i);
1885 continue;
1886 }
165a31c0
LP
1887
1888 if (add) {
8cfa775f 1889 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1890 if (r < 0)
1891 return r;
1892 } else
8cfa775f 1893 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1894 }
1895 }
1896
1897 return 0;
1898}
78e864e5
TM
1899
1900int seccomp_lock_personality(unsigned long personality) {
72eafe71 1901 uint32_t arch;
78e864e5
TM
1902 int r;
1903
72eafe71
LP
1904 if (personality >= PERSONALITY_INVALID)
1905 return -EINVAL;
78e864e5 1906
72eafe71
LP
1907 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1908 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1909
72eafe71
LP
1910 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1911 if (r < 0)
1912 return r;
1913
1914 r = seccomp_rule_add_exact(
1915 seccomp,
1916 SCMP_ACT_ERRNO(EPERM),
1917 SCMP_SYS(personality),
1918 1,
1919 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1920 if (r < 0) {
1921 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1922 continue;
1923 }
72eafe71
LP
1924
1925 r = seccomp_load(seccomp);
7bc5e0b1 1926 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1927 return r;
1928 if (r < 0)
1929 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1930 }
1931
1932 return 0;
78e864e5 1933}
aecd5ac6
TM
1934
1935int seccomp_protect_hostname(void) {
1936 uint32_t arch;
1937 int r;
1938
1939 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1940 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1941
1942 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1943 if (r < 0)
1944 return r;
1945
1946 r = seccomp_rule_add_exact(
1947 seccomp,
1948 SCMP_ACT_ERRNO(EPERM),
1949 SCMP_SYS(sethostname),
1950 0);
9e6e543c
LP
1951 if (r < 0) {
1952 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1953 continue;
9e6e543c 1954 }
aecd5ac6
TM
1955
1956 r = seccomp_rule_add_exact(
1957 seccomp,
1958 SCMP_ACT_ERRNO(EPERM),
1959 SCMP_SYS(setdomainname),
1960 0);
9e6e543c
LP
1961 if (r < 0) {
1962 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1963 continue;
9e6e543c 1964 }
aecd5ac6
TM
1965
1966 r = seccomp_load(seccomp);
7bc5e0b1 1967 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1968 return r;
1969 if (r < 0)
1970 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1971 }
1972
1973 return 0;
1974}
3c27973b 1975
da4dc9a6
ZJS
1976static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1977 /* Checks the mode_t parameter of the following system calls:
1978 *
1979 * → chmod() + fchmod() + fchmodat()
1980 * → open() + creat() + openat()
1981 * → mkdir() + mkdirat()
1982 * → mknod() + mknodat()
1983 *
1984 * Returns error if *everything* failed, and 0 otherwise.
1985 */
6d95e7d9 1986 int r;
da4dc9a6
ZJS
1987 bool any = false;
1988
1989 r = seccomp_rule_add_exact(
1990 seccomp,
1991 SCMP_ACT_ERRNO(EPERM),
1992 SCMP_SYS(chmod),
1993 1,
1994 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1995 if (r < 0)
1996 log_debug_errno(r, "Failed to add filter for chmod: %m");
1997 else
1998 any = true;
1999
2000 r = seccomp_rule_add_exact(
2001 seccomp,
2002 SCMP_ACT_ERRNO(EPERM),
2003 SCMP_SYS(fchmod),
2004 1,
2005 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2006 if (r < 0)
2007 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2008 else
2009 any = true;
2010
2011 r = seccomp_rule_add_exact(
2012 seccomp,
2013 SCMP_ACT_ERRNO(EPERM),
2014 SCMP_SYS(fchmodat),
2015 1,
2016 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2017 if (r < 0)
2018 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2019 else
2020 any = true;
2021
2022 r = seccomp_rule_add_exact(
2023 seccomp,
2024 SCMP_ACT_ERRNO(EPERM),
2025 SCMP_SYS(mkdir),
2026 1,
2027 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2028 if (r < 0)
2029 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2030 else
2031 any = true;
2032
2033 r = seccomp_rule_add_exact(
2034 seccomp,
2035 SCMP_ACT_ERRNO(EPERM),
2036 SCMP_SYS(mkdirat),
2037 1,
2038 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2039 if (r < 0)
2040 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2041 else
2042 any = true;
2043
2044 r = seccomp_rule_add_exact(
2045 seccomp,
2046 SCMP_ACT_ERRNO(EPERM),
2047 SCMP_SYS(mknod),
2048 1,
2049 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2050 if (r < 0)
2051 log_debug_errno(r, "Failed to add filter for mknod: %m");
2052 else
2053 any = true;
2054
2055 r = seccomp_rule_add_exact(
2056 seccomp,
2057 SCMP_ACT_ERRNO(EPERM),
2058 SCMP_SYS(mknodat),
2059 1,
2060 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2061 if (r < 0)
2062 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2063 else
2064 any = true;
2065
da4dc9a6
ZJS
2066 r = seccomp_rule_add_exact(
2067 seccomp,
2068 SCMP_ACT_ERRNO(EPERM),
2069 SCMP_SYS(open),
2070 2,
2071 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2072 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2073 if (r < 0)
2074 log_debug_errno(r, "Failed to add filter for open: %m");
2075 else
2076 any = true;
da4dc9a6
ZJS
2077
2078 r = seccomp_rule_add_exact(
2079 seccomp,
2080 SCMP_ACT_ERRNO(EPERM),
2081 SCMP_SYS(openat),
2082 2,
2083 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2084 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2085 if (r < 0)
2086 log_debug_errno(r, "Failed to add filter for openat: %m");
2087 else
2088 any = true;
2089
ecc04067
LP
2090#if defined(__SNR_openat2)
2091 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2092 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2093 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
57353d29
MG
2094 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2095 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2096 * to call open() or openat() instead. We can properly enforce policy for those functions. */
ecc04067
LP
2097 r = seccomp_rule_add_exact(
2098 seccomp,
57353d29 2099 SCMP_ACT_ERRNO(ENOSYS),
ecc04067
LP
2100 SCMP_SYS(openat2),
2101 0);
2102 if (r < 0)
2103 log_debug_errno(r, "Failed to add filter for openat2: %m");
2104 else
2105 any = true;
2106#endif
2107
da4dc9a6
ZJS
2108 r = seccomp_rule_add_exact(
2109 seccomp,
2110 SCMP_ACT_ERRNO(EPERM),
2111 SCMP_SYS(creat),
2112 1,
2113 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2114 if (r < 0)
2115 log_debug_errno(r, "Failed to add filter for creat: %m");
2116 else
2117 any = true;
2118
2119 return any ? 0 : r;
2120}
2121
3c27973b
LP
2122int seccomp_restrict_suid_sgid(void) {
2123 uint32_t arch;
da4dc9a6 2124 int r, k;
3c27973b
LP
2125
2126 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2127 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2128
2129 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2130 if (r < 0)
2131 return r;
2132
da4dc9a6
ZJS
2133 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2134 if (r < 0)
2135 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2136
da4dc9a6
ZJS
2137 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2138 if (k < 0)
2139 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2140
da4dc9a6 2141 if (r < 0 && k < 0)
3c27973b 2142 continue;
3c27973b
LP
2143
2144 r = seccomp_load(seccomp);
7bc5e0b1 2145 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2146 return r;
2147 if (r < 0)
2148 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2149 }
2150
2151 return 0;
2152}
915fb324
LP
2153
2154uint32_t scmp_act_kill_process(void) {
2155
2156 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2157 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2158 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2159 * for single-threaded apps does the right thing. */
2160
2161#ifdef SCMP_ACT_KILL_PROCESS
2162 if (seccomp_api_get() >= 3)
2163 return SCMP_ACT_KILL_PROCESS;
2164#endif
2165
2166 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2167}
22eadc28
YW
2168
2169int parse_syscall_and_errno(const char *in, char **name, int *error) {
2170 _cleanup_free_ char *n = NULL;
2171 char *p;
2172 int e = -1;
2173
2174 assert(in);
2175 assert(name);
2176 assert(error);
2177
2178 /*
2179 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2180 * If errno is omitted, then error is set to -1.
2181 * Empty syscall name is not allowed.
2182 * Here, we do not check that the syscall name is valid or not.
2183 */
2184
2185 p = strchr(in, ':');
2186 if (p) {
2187 e = seccomp_parse_errno_or_action(p + 1);
2188 if (e < 0)
2189 return e;
2190
2191 n = strndup(in, p - in);
2192 } else
2193 n = strdup(in);
2194
2195 if (!n)
2196 return -ENOMEM;
2197
2198 if (isempty(n))
2199 return -EINVAL;
2200
2201 *error = e;
2202 *name = TAKE_PTR(n);
2203
2204 return 0;
2205}
4a4654e0
LP
2206
2207static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2208 bool any = false;
2209 int r;
2210
2211 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2212 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2213
4a4654e0
LP
2214 r = seccomp_rule_add_exact(
2215 seccomp,
2216 SCMP_ACT_ERRNO(EINVAL),
2217 SCMP_SYS(open),
2218 1,
2219 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2220 if (r < 0)
2221 log_debug_errno(r, "Failed to add filter for open: %m");
2222 else
2223 any = true;
4a4654e0
LP
2224
2225 r = seccomp_rule_add_exact(
2226 seccomp,
2227 SCMP_ACT_ERRNO(EINVAL),
2228 SCMP_SYS(openat),
2229 1,
2230 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2231 if (r < 0)
2232 log_debug_errno(r, "Failed to add filter for openat: %m");
2233 else
2234 any = true;
2235
2236#if defined(__SNR_openat2)
2237 /* The new openat2() system call can't be filtered sensibly, see above. */
2238 r = seccomp_rule_add_exact(
2239 seccomp,
2240 SCMP_ACT_ERRNO(ENOSYS),
2241 SCMP_SYS(openat2),
2242 0);
2243 if (r < 0)
2244 log_debug_errno(r, "Failed to add filter for openat2: %m");
2245 else
2246 any = true;
2247#endif
2248
2249 return any ? 0 : r;
2250}
2251
2252int seccomp_suppress_sync(void) {
2253 uint32_t arch;
2254 int r;
2255
2256 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2257 * manageable, and also masks O_SYNC/O_DSYNC */
2258
2259 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2260 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2261 const char *c;
2262
2263 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2264 if (r < 0)
2265 return r;
2266
2267 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2268 int id;
2269
2270 id = seccomp_syscall_resolve_name(c);
2271 if (id == __NR_SCMP_ERROR) {
2272 log_debug("System call %s is not known, ignoring.", c);
2273 continue;
2274 }
2275
2276 r = seccomp_rule_add_exact(
2277 seccomp,
2278 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2279 id,
2280 0);
2281 if (r < 0)
2282 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2283 }
2284
2285 (void) block_open_flag(seccomp, O_SYNC);
2286#if O_DSYNC != O_SYNC
2287 (void) block_open_flag(seccomp, O_DSYNC);
2288#endif
2289
2290 r = seccomp_load(seccomp);
2291 if (ERRNO_IS_SECCOMP_FATAL(r))
2292 return r;
2293 if (r < 0)
2294 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2295 }
2296
2297 return 0;
2298}