]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
mmap-cache: bind prot(ection) to MMapFileDescriptor
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
44aaddad 15#include "env-util.h"
d8b4d14d 16#include "errno-list.h"
a8fbdf54 17#include "macro.h"
add00535 18#include "nsflags.h"
d8b4d14d 19#include "nulstr-util.h"
78e864e5 20#include "process-util.h"
cf0fbc49 21#include "seccomp-util.h"
b16bd535 22#include "set.h"
07630cea 23#include "string-util.h"
b16bd535 24#include "strv.h"
469830d1
LP
25
26const uint32_t seccomp_local_archs[] = {
27
6b000af4 28 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
29
30#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
31 SCMP_ARCH_X86,
32 SCMP_ARCH_X86_64,
f2d9751c
LP
33 SCMP_ARCH_X32, /* native */
34#elif defined(__x86_64__) && !defined(__ILP32__)
35 SCMP_ARCH_X86,
469830d1 36 SCMP_ARCH_X32,
f2d9751c
LP
37 SCMP_ARCH_X86_64, /* native */
38#elif defined(__i386__)
39 SCMP_ARCH_X86,
40#elif defined(__aarch64__)
469830d1 41 SCMP_ARCH_ARM,
f2d9751c
LP
42 SCMP_ARCH_AARCH64, /* native */
43#elif defined(__arm__)
44 SCMP_ARCH_ARM,
45#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPSEL,
47 SCMP_ARCH_MIPS, /* native */
48#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 49 SCMP_ARCH_MIPS,
f2d9751c
LP
50 SCMP_ARCH_MIPSEL, /* native */
51#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
52 SCMP_ARCH_MIPSEL,
53 SCMP_ARCH_MIPS,
54 SCMP_ARCH_MIPSEL64N32,
469830d1 55 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
56 SCMP_ARCH_MIPSEL64,
57 SCMP_ARCH_MIPS64, /* native */
58#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPS,
469830d1 60 SCMP_ARCH_MIPSEL,
f2d9751c
LP
61 SCMP_ARCH_MIPS64N32,
62 SCMP_ARCH_MIPSEL64N32,
63 SCMP_ARCH_MIPS64,
64 SCMP_ARCH_MIPSEL64, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
469830d1 68 SCMP_ARCH_MIPSEL64,
f2d9751c 69 SCMP_ARCH_MIPS64,
469830d1 70 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
71 SCMP_ARCH_MIPS64N32, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64,
76 SCMP_ARCH_MIPSEL64,
77 SCMP_ARCH_MIPS64N32,
78 SCMP_ARCH_MIPSEL64N32, /* native */
79#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 80 SCMP_ARCH_PPC,
469830d1 81 SCMP_ARCH_PPC64LE,
f2d9751c
LP
82 SCMP_ARCH_PPC64, /* native */
83#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
84 SCMP_ARCH_PPC,
85 SCMP_ARCH_PPC64,
86 SCMP_ARCH_PPC64LE, /* native */
87#elif defined(__powerpc__)
88 SCMP_ARCH_PPC,
f9252236
AJ
89#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
90 SCMP_ARCH_RISCV64,
f2d9751c
LP
91#elif defined(__s390x__)
92 SCMP_ARCH_S390,
93 SCMP_ARCH_S390X, /* native */
94#elif defined(__s390__)
469830d1 95 SCMP_ARCH_S390,
469830d1
LP
96#endif
97 (uint32_t) -1
98 };
57183d11
LP
99
100const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
101 /* Maintain order used in <seccomp.h>.
102 *
103 * Names used here should be the same as those used for ConditionArchitecture=,
104 * except for "subarchitectures" like x32. */
57183d11 105
aa34055f
ZJS
106 switch(c) {
107 case SCMP_ARCH_NATIVE:
57183d11 108 return "native";
aa34055f 109 case SCMP_ARCH_X86:
57183d11 110 return "x86";
aa34055f 111 case SCMP_ARCH_X86_64:
57183d11 112 return "x86-64";
aa34055f 113 case SCMP_ARCH_X32:
57183d11 114 return "x32";
aa34055f 115 case SCMP_ARCH_ARM:
57183d11 116 return "arm";
aa34055f
ZJS
117 case SCMP_ARCH_AARCH64:
118 return "arm64";
119 case SCMP_ARCH_MIPS:
120 return "mips";
121 case SCMP_ARCH_MIPS64:
122 return "mips64";
123 case SCMP_ARCH_MIPS64N32:
124 return "mips64-n32";
125 case SCMP_ARCH_MIPSEL:
126 return "mips-le";
127 case SCMP_ARCH_MIPSEL64:
128 return "mips64-le";
129 case SCMP_ARCH_MIPSEL64N32:
130 return "mips64-le-n32";
131 case SCMP_ARCH_PPC:
132 return "ppc";
133 case SCMP_ARCH_PPC64:
134 return "ppc64";
135 case SCMP_ARCH_PPC64LE:
136 return "ppc64-le";
f9252236
AJ
137#ifdef SCMP_ARCH_RISCV64
138 case SCMP_ARCH_RISCV64:
139 return "riscv64";
140#endif
aa34055f 141 case SCMP_ARCH_S390:
6abfd303 142 return "s390";
aa34055f 143 case SCMP_ARCH_S390X:
6abfd303 144 return "s390x";
aa34055f
ZJS
145 default:
146 return NULL;
147 }
57183d11
LP
148}
149
150int seccomp_arch_from_string(const char *n, uint32_t *ret) {
151 if (!n)
152 return -EINVAL;
153
154 assert(ret);
155
156 if (streq(n, "native"))
157 *ret = SCMP_ARCH_NATIVE;
158 else if (streq(n, "x86"))
159 *ret = SCMP_ARCH_X86;
160 else if (streq(n, "x86-64"))
161 *ret = SCMP_ARCH_X86_64;
162 else if (streq(n, "x32"))
163 *ret = SCMP_ARCH_X32;
164 else if (streq(n, "arm"))
165 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
166 else if (streq(n, "arm64"))
167 *ret = SCMP_ARCH_AARCH64;
168 else if (streq(n, "mips"))
169 *ret = SCMP_ARCH_MIPS;
170 else if (streq(n, "mips64"))
171 *ret = SCMP_ARCH_MIPS64;
172 else if (streq(n, "mips64-n32"))
173 *ret = SCMP_ARCH_MIPS64N32;
174 else if (streq(n, "mips-le"))
175 *ret = SCMP_ARCH_MIPSEL;
176 else if (streq(n, "mips64-le"))
177 *ret = SCMP_ARCH_MIPSEL64;
178 else if (streq(n, "mips64-le-n32"))
179 *ret = SCMP_ARCH_MIPSEL64N32;
180 else if (streq(n, "ppc"))
181 *ret = SCMP_ARCH_PPC;
182 else if (streq(n, "ppc64"))
183 *ret = SCMP_ARCH_PPC64;
184 else if (streq(n, "ppc64-le"))
185 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
186#ifdef SCMP_ARCH_RISCV64
187 else if (streq(n, "riscv64"))
188 *ret = SCMP_ARCH_RISCV64;
189#endif
6abfd303
HB
190 else if (streq(n, "s390"))
191 *ret = SCMP_ARCH_S390;
192 else if (streq(n, "s390x"))
193 *ret = SCMP_ARCH_S390X;
57183d11
LP
194 else
195 return -EINVAL;
196
197 return 0;
198}
e9642be2 199
469830d1 200int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 201 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
202 int r;
203
469830d1
LP
204 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
205 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
206
207 seccomp = seccomp_init(default_action);
208 if (!seccomp)
209 return -ENOMEM;
210
469830d1
LP
211 if (arch != SCMP_ARCH_NATIVE &&
212 arch != seccomp_arch_native()) {
213
1b52793d 214 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 215 if (r < 0)
b4eaa6cc 216 return r;
469830d1 217
1b52793d 218 r = seccomp_arch_add(seccomp, arch);
469830d1 219 if (r < 0)
b4eaa6cc 220 return r;
469830d1
LP
221
222 assert(seccomp_arch_exist(seccomp, arch) >= 0);
223 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
224 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
225 } else {
226 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
227 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
228 }
229
230 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 231 if (r < 0)
b4eaa6cc 232 return r;
8d7b0c8f
LP
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
235 if (r < 0)
b4eaa6cc 236 return r;
8d7b0c8f 237
44aaddad
SD
238#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
239 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
240 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
241 if (r < 0)
242 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
243 }
244#endif
245
b4eaa6cc 246 *ret = TAKE_PTR(seccomp);
8d7b0c8f 247 return 0;
8d7b0c8f
LP
248}
249
/* Returns true if querying the current seccomp mode with prctl(PR_GET_SECCOMP) succeeds, i.e.
 * yields a non-negative result. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
253
/* Probes for seccomp filter mode by asking prctl() to install a NULL filter. Reports true only if
 * that call fails with EFAULT; any other outcome (including success) counts as "not available".
 * NOTE(review): presumably EFAULT shows the kernel got as far as reading the (NULL) filter pointer,
 * i.e. it knows SECCOMP_MODE_FILTER; confirm against prctl(2). */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Tells whether seccomp filtering may be used. The answer is determined on the first call and kept
 * in a function-local static, so later calls return the cached value.
 *
 * If $SYSTEMD_SECCOMP parses as false, seccomp is reported as unavailable. Otherwise (true, unset,
 * or unparsable) the result is whether both the basic and the filter probe succeed. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;
        int b;

        /* Already determined earlier? */
        if (cached_enabled >= 0)
                return cached_enabled;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0)
                /* Explicitly turned off via the environment */
                cached_enabled = false;
        else {
                if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                        log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

                cached_enabled =
                        is_basic_seccomp_available() &&
                        is_seccomp_filter_available();
        }

        return cached_enabled;
}
279
8130926d 280const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 281 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 282 .name = "@default",
d5efc18b 283 .help = "System calls that are always permitted",
40eb6a80 284 .value =
5abede32 285 "brk\0"
8e24b1d2 286 "cacheflush\0"
40eb6a80 287 "clock_getres\0"
6ca67710 288 "clock_getres_time64\0"
40eb6a80 289 "clock_gettime\0"
6ca67710 290 "clock_gettime64\0"
40eb6a80 291 "clock_nanosleep\0"
6ca67710 292 "clock_nanosleep_time64\0"
40eb6a80
ZJS
293 "execve\0"
294 "exit\0"
295 "exit_group\0"
e41b0f42 296 "futex\0"
6ca67710 297 "futex_time64\0"
e41b0f42
LP
298 "get_robust_list\0"
299 "get_thread_area\0"
09d3020b
DH
300 "getegid\0"
301 "getegid32\0"
302 "geteuid\0"
303 "geteuid32\0"
304 "getgid\0"
305 "getgid32\0"
306 "getgroups\0"
307 "getgroups32\0"
308 "getpgid\0"
309 "getpgrp\0"
310 "getpid\0"
311 "getppid\0"
312 "getresgid\0"
313 "getresgid32\0"
314 "getresuid\0"
315 "getresuid32\0"
40eb6a80 316 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
317 "getsid\0"
318 "gettid\0"
40eb6a80 319 "gettimeofday\0"
09d3020b
DH
320 "getuid\0"
321 "getuid32\0"
e41b0f42 322 "membarrier\0"
5abede32
LP
323 "mmap\0"
324 "mmap2\0"
11b9105d 325 "munmap\0"
40eb6a80
ZJS
326 "nanosleep\0"
327 "pause\0"
4c3a9176 328 "prlimit64\0"
e41b0f42 329 "restart_syscall\0"
6fee3be0 330 "rseq\0"
40eb6a80 331 "rt_sigreturn\0"
8f44de08 332 "sched_yield\0"
e41b0f42
LP
333 "set_robust_list\0"
334 "set_thread_area\0"
335 "set_tid_address\0"
ce5faeac 336 "set_tls\0"
40eb6a80
ZJS
337 "sigreturn\0"
338 "time\0"
4c3a9176 339 "ugetrlimit\0"
40eb6a80 340 },
44898c53
LP
341 [SYSCALL_FILTER_SET_AIO] = {
342 .name = "@aio",
343 .help = "Asynchronous IO",
344 .value =
345 "io_cancel\0"
346 "io_destroy\0"
347 "io_getevents\0"
a05cfe23 348 "io_pgetevents\0"
6ca67710 349 "io_pgetevents_time64\0"
44898c53
LP
350 "io_setup\0"
351 "io_submit\0"
9e486265
LP
352 "io_uring_enter\0"
353 "io_uring_register\0"
354 "io_uring_setup\0"
44898c53 355 },
133ddbbe 356 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 357 .name = "@basic-io",
d5efc18b 358 .help = "Basic IO",
133ddbbe 359 .value =
648a0ed0 360 "_llseek\0"
133ddbbe 361 "close\0"
6ea0d25c 362 "close_range\0"
648a0ed0 363 "dup\0"
133ddbbe
LP
364 "dup2\0"
365 "dup3\0"
133ddbbe
LP
366 "lseek\0"
367 "pread64\0"
368 "preadv\0"
44898c53 369 "preadv2\0"
133ddbbe
LP
370 "pwrite64\0"
371 "pwritev\0"
44898c53 372 "pwritev2\0"
133ddbbe
LP
373 "read\0"
374 "readv\0"
375 "write\0"
376 "writev\0"
377 },
44898c53
LP
378 [SYSCALL_FILTER_SET_CHOWN] = {
379 .name = "@chown",
380 .help = "Change ownership of files and directories",
381 .value =
382 "chown\0"
383 "chown32\0"
384 "fchown\0"
385 "fchown32\0"
386 "fchownat\0"
387 "lchown\0"
388 "lchown32\0"
389 },
8130926d 390 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 391 .name = "@clock",
d5efc18b 392 .help = "Change the system time",
201c1cc2
TM
393 .value =
394 "adjtimex\0"
1f9ac68b 395 "clock_adjtime\0"
6ca67710 396 "clock_adjtime64\0"
1f9ac68b 397 "clock_settime\0"
6ca67710 398 "clock_settime64\0"
201c1cc2 399 "settimeofday\0"
8130926d
LP
400 },
401 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 402 .name = "@cpu-emulation",
d5efc18b 403 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
404 .value =
405 "modify_ldt\0"
406 "subpage_prot\0"
407 "switch_endian\0"
408 "vm86\0"
409 "vm86old\0"
8130926d
LP
410 },
411 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 412 .name = "@debug",
d5efc18b 413 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
414 .value =
415 "lookup_dcookie\0"
416 "perf_event_open\0"
8270e3d8 417 "pidfd_getfd\0"
1f9ac68b
LP
418 "ptrace\0"
419 "rtas\0"
6da432fd 420#if defined __s390__ || defined __s390x__
1f9ac68b 421 "s390_runtime_instr\0"
8130926d 422#endif
1f9ac68b 423 "sys_debug_setcontext\0"
8130926d 424 },
1a1b13c9
LP
425 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
426 .name = "@file-system",
427 .help = "File system operations",
428 .value =
429 "access\0"
430 "chdir\0"
431 "chmod\0"
432 "close\0"
433 "creat\0"
434 "faccessat\0"
bcf08acb 435 "faccessat2\0"
1a1b13c9
LP
436 "fallocate\0"
437 "fchdir\0"
438 "fchmod\0"
439 "fchmodat\0"
1a1b13c9 440 "fcntl\0"
ceaa6aa7 441 "fcntl64\0"
1a1b13c9
LP
442 "fgetxattr\0"
443 "flistxattr\0"
ceaa6aa7 444 "fremovexattr\0"
1a1b13c9 445 "fsetxattr\0"
1a1b13c9 446 "fstat\0"
ceaa6aa7 447 "fstat64\0"
1a1b13c9 448 "fstatat64\0"
1a1b13c9 449 "fstatfs\0"
ceaa6aa7 450 "fstatfs64\0"
1a1b13c9 451 "ftruncate\0"
ceaa6aa7 452 "ftruncate64\0"
1a1b13c9
LP
453 "futimesat\0"
454 "getcwd\0"
1a1b13c9 455 "getdents\0"
ceaa6aa7 456 "getdents64\0"
1a1b13c9
LP
457 "getxattr\0"
458 "inotify_add_watch\0"
ceaa6aa7 459 "inotify_init\0"
1a1b13c9
LP
460 "inotify_init1\0"
461 "inotify_rm_watch\0"
462 "lgetxattr\0"
463 "link\0"
464 "linkat\0"
465 "listxattr\0"
466 "llistxattr\0"
467 "lremovexattr\0"
468 "lsetxattr\0"
1a1b13c9 469 "lstat\0"
ceaa6aa7 470 "lstat64\0"
1a1b13c9
LP
471 "mkdir\0"
472 "mkdirat\0"
473 "mknod\0"
474 "mknodat\0"
1a1b13c9 475 "newfstatat\0"
ceaa6aa7
LP
476 "oldfstat\0"
477 "oldlstat\0"
478 "oldstat\0"
1a1b13c9
LP
479 "open\0"
480 "openat\0"
8270e3d8 481 "openat2\0"
1a1b13c9
LP
482 "readlink\0"
483 "readlinkat\0"
484 "removexattr\0"
485 "rename\0"
1a1b13c9 486 "renameat\0"
ceaa6aa7 487 "renameat2\0"
1a1b13c9
LP
488 "rmdir\0"
489 "setxattr\0"
1a1b13c9 490 "stat\0"
ceaa6aa7 491 "stat64\0"
1a1b13c9 492 "statfs\0"
ceaa6aa7 493 "statfs64\0"
a4135a74 494 "statx\0"
1a1b13c9
LP
495 "symlink\0"
496 "symlinkat\0"
1a1b13c9 497 "truncate\0"
ceaa6aa7 498 "truncate64\0"
1a1b13c9
LP
499 "unlink\0"
500 "unlinkat\0"
ceaa6aa7 501 "utime\0"
1a1b13c9 502 "utimensat\0"
6ca67710 503 "utimensat_time64\0"
1a1b13c9
LP
504 "utimes\0"
505 },
8130926d 506 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 507 .name = "@io-event",
d5efc18b 508 .help = "Event loop system calls",
201c1cc2
TM
509 .value =
510 "_newselect\0"
201c1cc2 511 "epoll_create\0"
215728ff 512 "epoll_create1\0"
201c1cc2
TM
513 "epoll_ctl\0"
514 "epoll_ctl_old\0"
515 "epoll_pwait\0"
516 "epoll_wait\0"
517 "epoll_wait_old\0"
201c1cc2 518 "eventfd\0"
215728ff 519 "eventfd2\0"
201c1cc2
TM
520 "poll\0"
521 "ppoll\0"
6ca67710 522 "ppoll_time64\0"
201c1cc2 523 "pselect6\0"
6ca67710 524 "pselect6_time64\0"
201c1cc2 525 "select\0"
8130926d
LP
526 },
527 [SYSCALL_FILTER_SET_IPC] = {
8130926d 528 .name = "@ipc",
d5efc18b
ZJS
529 .help = "SysV IPC, POSIX Message Queues or other IPC",
530 .value =
531 "ipc\0"
cd5bfd7e 532 "memfd_create\0"
201c1cc2
TM
533 "mq_getsetattr\0"
534 "mq_notify\0"
535 "mq_open\0"
536 "mq_timedreceive\0"
6ca67710 537 "mq_timedreceive_time64\0"
201c1cc2 538 "mq_timedsend\0"
6ca67710 539 "mq_timedsend_time64\0"
201c1cc2
TM
540 "mq_unlink\0"
541 "msgctl\0"
542 "msgget\0"
543 "msgrcv\0"
544 "msgsnd\0"
cd5bfd7e 545 "pipe\0"
215728ff 546 "pipe2\0"
201c1cc2
TM
547 "process_vm_readv\0"
548 "process_vm_writev\0"
549 "semctl\0"
550 "semget\0"
551 "semop\0"
552 "semtimedop\0"
6ca67710 553 "semtimedop_time64\0"
201c1cc2
TM
554 "shmat\0"
555 "shmctl\0"
556 "shmdt\0"
557 "shmget\0"
8130926d
LP
558 },
559 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 560 .name = "@keyring",
d5efc18b 561 .help = "Kernel keyring access",
1f9ac68b
LP
562 .value =
563 "add_key\0"
564 "keyctl\0"
565 "request_key\0"
8130926d 566 },
cd0ddf6f
LP
567 [SYSCALL_FILTER_SET_MEMLOCK] = {
568 .name = "@memlock",
569 .help = "Memory locking control",
570 .value =
571 "mlock\0"
572 "mlock2\0"
573 "mlockall\0"
574 "munlock\0"
575 "munlockall\0"
576 },
8130926d 577 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 578 .name = "@module",
d5efc18b 579 .help = "Loading and unloading of kernel modules",
201c1cc2 580 .value =
201c1cc2
TM
581 "delete_module\0"
582 "finit_module\0"
583 "init_module\0"
8130926d
LP
584 },
585 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 586 .name = "@mount",
d5efc18b 587 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
588 .value =
589 "chroot\0"
9e486265
LP
590 "fsconfig\0"
591 "fsmount\0"
592 "fsopen\0"
593 "fspick\0"
201c1cc2 594 "mount\0"
9e486265
LP
595 "move_mount\0"
596 "open_tree\0"
201c1cc2 597 "pivot_root\0"
201c1cc2 598 "umount\0"
215728ff 599 "umount2\0"
8130926d
LP
600 },
601 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 602 .name = "@network-io",
d5efc18b 603 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 604 .value =
201c1cc2 605 "accept\0"
215728ff 606 "accept4\0"
201c1cc2
TM
607 "bind\0"
608 "connect\0"
609 "getpeername\0"
610 "getsockname\0"
611 "getsockopt\0"
612 "listen\0"
613 "recv\0"
614 "recvfrom\0"
615 "recvmmsg\0"
6ca67710 616 "recvmmsg_time64\0"
201c1cc2
TM
617 "recvmsg\0"
618 "send\0"
619 "sendmmsg\0"
620 "sendmsg\0"
621 "sendto\0"
622 "setsockopt\0"
623 "shutdown\0"
624 "socket\0"
625 "socketcall\0"
626 "socketpair\0"
8130926d
LP
627 },
628 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 629 /* some unknown even to libseccomp */
8130926d 630 .name = "@obsolete",
d5efc18b 631 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
632 .value =
633 "_sysctl\0"
634 "afs_syscall\0"
802fa07a 635 "bdflush\0"
201c1cc2 636 "break\0"
1f9ac68b 637 "create_module\0"
201c1cc2
TM
638 "ftime\0"
639 "get_kernel_syms\0"
201c1cc2
TM
640 "getpmsg\0"
641 "gtty\0"
7e0c3b8f 642 "idle\0"
201c1cc2 643 "lock\0"
201c1cc2 644 "mpx\0"
201c1cc2
TM
645 "prof\0"
646 "profil\0"
201c1cc2
TM
647 "putpmsg\0"
648 "query_module\0"
201c1cc2
TM
649 "security\0"
650 "sgetmask\0"
651 "ssetmask\0"
ae5e9bf4 652 "stime\0"
201c1cc2 653 "stty\0"
1f9ac68b 654 "sysfs\0"
201c1cc2
TM
655 "tuxcall\0"
656 "ulimit\0"
657 "uselib\0"
1f9ac68b 658 "ustat\0"
201c1cc2 659 "vserver\0"
8130926d 660 },
9493b168
ZJS
661 [SYSCALL_FILTER_SET_PKEY] = {
662 .name = "@pkey",
663 .help = "System calls used for memory protection keys",
664 .value =
665 "pkey_alloc\0"
666 "pkey_free\0"
667 "pkey_mprotect\0"
668 },
8130926d 669 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 670 .name = "@privileged",
d5efc18b 671 .help = "All system calls which need super-user capabilities",
201c1cc2 672 .value =
44898c53 673 "@chown\0"
201c1cc2
TM
674 "@clock\0"
675 "@module\0"
676 "@raw-io\0"
af0f047b
LP
677 "@reboot\0"
678 "@swap\0"
215728ff 679 "_sysctl\0"
201c1cc2 680 "acct\0"
201c1cc2 681 "bpf\0"
1f9ac68b 682 "capset\0"
201c1cc2 683 "chroot\0"
a05cfe23 684 "fanotify_init\0"
9e486265 685 "fanotify_mark\0"
201c1cc2 686 "nfsservctl\0"
a05cfe23 687 "open_by_handle_at\0"
201c1cc2
TM
688 "pivot_root\0"
689 "quotactl\0"
201c1cc2 690 "setdomainname\0"
201c1cc2 691 "setfsuid\0"
215728ff 692 "setfsuid32\0"
201c1cc2 693 "setgroups\0"
215728ff 694 "setgroups32\0"
201c1cc2 695 "sethostname\0"
201c1cc2 696 "setresuid\0"
215728ff 697 "setresuid32\0"
201c1cc2 698 "setreuid\0"
215728ff 699 "setreuid32\0"
e05ee49b 700 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 701 "setuid32\0"
201c1cc2 702 "vhangup\0"
8130926d
LP
703 },
704 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 705 .name = "@process",
7b121df6 706 .help = "Process control, execution, namespacing operations",
201c1cc2
TM
707 .value =
708 "arch_prctl\0"
09d3020b 709 "capget\0" /* Able to query arbitrary processes */
201c1cc2 710 "clone\0"
9e486265 711 "clone3\0"
201c1cc2
TM
712 "execveat\0"
713 "fork\0"
b887d2eb 714 "getrusage\0"
201c1cc2 715 "kill\0"
9e486265 716 "pidfd_open\0"
46fcf95d 717 "pidfd_send_signal\0"
201c1cc2 718 "prctl\0"
b887d2eb
LP
719 "rt_sigqueueinfo\0"
720 "rt_tgsigqueueinfo\0"
201c1cc2 721 "setns\0"
a9518dc3 722 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 723 "tgkill\0"
b887d2eb 724 "times\0"
201c1cc2
TM
725 "tkill\0"
726 "unshare\0"
727 "vfork\0"
b887d2eb
LP
728 "wait4\0"
729 "waitid\0"
730 "waitpid\0"
8130926d
LP
731 },
732 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 733 .name = "@raw-io",
d5efc18b 734 .help = "Raw I/O port access",
201c1cc2
TM
735 .value =
736 "ioperm\0"
737 "iopl\0"
1f9ac68b 738 "pciconfig_iobase\0"
201c1cc2
TM
739 "pciconfig_read\0"
740 "pciconfig_write\0"
6da432fd 741#if defined __s390__ || defined __s390x__
201c1cc2
TM
742 "s390_pci_mmio_read\0"
743 "s390_pci_mmio_write\0"
8130926d
LP
744#endif
745 },
bd2ab3f4
LP
746 [SYSCALL_FILTER_SET_REBOOT] = {
747 .name = "@reboot",
748 .help = "Reboot and reboot preparation/kexec",
749 .value =
bd2ab3f4 750 "kexec_file_load\0"
e59608fa 751 "kexec_load\0"
bd2ab3f4
LP
752 "reboot\0"
753 },
133ddbbe 754 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 755 .name = "@resources",
58a8f68b 756 .help = "Alter resource settings",
133ddbbe 757 .value =
0963c053
LP
758 "ioprio_set\0"
759 "mbind\0"
760 "migrate_pages\0"
761 "move_pages\0"
762 "nice\0"
0963c053
LP
763 "sched_setaffinity\0"
764 "sched_setattr\0"
133ddbbe
LP
765 "sched_setparam\0"
766 "sched_setscheduler\0"
0963c053 767 "set_mempolicy\0"
133ddbbe
LP
768 "setpriority\0"
769 "setrlimit\0"
133ddbbe 770 },
6eaaeee9
LP
771 [SYSCALL_FILTER_SET_SETUID] = {
772 .name = "@setuid",
773 .help = "Operations for changing user/group credentials",
774 .value =
6eaaeee9 775 "setgid\0"
215728ff 776 "setgid32\0"
6eaaeee9 777 "setgroups\0"
215728ff 778 "setgroups32\0"
6eaaeee9 779 "setregid\0"
215728ff 780 "setregid32\0"
6eaaeee9 781 "setresgid\0"
215728ff 782 "setresgid32\0"
6eaaeee9 783 "setresuid\0"
215728ff 784 "setresuid32\0"
6eaaeee9 785 "setreuid\0"
215728ff 786 "setreuid32\0"
6eaaeee9 787 "setuid\0"
215728ff 788 "setuid32\0"
6eaaeee9 789 },
cd0ddf6f
LP
790 [SYSCALL_FILTER_SET_SIGNAL] = {
791 .name = "@signal",
792 .help = "Process signal handling",
793 .value =
794 "rt_sigaction\0"
795 "rt_sigpending\0"
796 "rt_sigprocmask\0"
797 "rt_sigsuspend\0"
798 "rt_sigtimedwait\0"
6ca67710 799 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
800 "sigaction\0"
801 "sigaltstack\0"
802 "signal\0"
803 "signalfd\0"
804 "signalfd4\0"
805 "sigpending\0"
806 "sigprocmask\0"
807 "sigsuspend\0"
808 },
bd2ab3f4
LP
809 [SYSCALL_FILTER_SET_SWAP] = {
810 .name = "@swap",
811 .help = "Enable/disable swap devices",
812 .value =
813 "swapoff\0"
814 "swapon\0"
815 },
44898c53
LP
816 [SYSCALL_FILTER_SET_SYNC] = {
817 .name = "@sync",
818 .help = "Synchronize files and memory to storage",
819 .value =
820 "fdatasync\0"
821 "fsync\0"
822 "msync\0"
823 "sync\0"
824 "sync_file_range\0"
a8fb09f5 825 "sync_file_range2\0"
44898c53
LP
826 "syncfs\0"
827 },
70526841
LP
828 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
829 .name = "@system-service",
830 .help = "General system service operations",
831 .value =
832 "@aio\0"
833 "@basic-io\0"
834 "@chown\0"
835 "@default\0"
836 "@file-system\0"
837 "@io-event\0"
838 "@ipc\0"
839 "@keyring\0"
840 "@memlock\0"
841 "@network-io\0"
842 "@process\0"
843 "@resources\0"
844 "@setuid\0"
845 "@signal\0"
846 "@sync\0"
847 "@timer\0"
70526841
LP
848 "capget\0"
849 "capset\0"
850 "copy_file_range\0"
851 "fadvise64\0"
852 "fadvise64_64\0"
853 "flock\0"
854 "get_mempolicy\0"
855 "getcpu\0"
856 "getpriority\0"
857 "getrandom\0"
858 "ioctl\0"
859 "ioprio_get\0"
860 "kcmp\0"
861 "madvise\0"
70526841
LP
862 "mprotect\0"
863 "mremap\0"
864 "name_to_handle_at\0"
865 "oldolduname\0"
866 "olduname\0"
867 "personality\0"
868 "readahead\0"
869 "readdir\0"
870 "remap_file_pages\0"
871 "sched_get_priority_max\0"
872 "sched_get_priority_min\0"
873 "sched_getaffinity\0"
874 "sched_getattr\0"
875 "sched_getparam\0"
876 "sched_getscheduler\0"
877 "sched_rr_get_interval\0"
6ca67710 878 "sched_rr_get_interval_time64\0"
70526841
LP
879 "sched_yield\0"
880 "sendfile\0"
881 "sendfile64\0"
882 "setfsgid\0"
883 "setfsgid32\0"
884 "setfsuid\0"
885 "setfsuid32\0"
886 "setpgid\0"
887 "setsid\0"
888 "splice\0"
889 "sysinfo\0"
890 "tee\0"
891 "umask\0"
892 "uname\0"
893 "userfaultfd\0"
894 "vmsplice\0"
895 },
cd0ddf6f
LP
896 [SYSCALL_FILTER_SET_TIMER] = {
897 .name = "@timer",
898 .help = "Schedule operations by time",
899 .value =
900 "alarm\0"
901 "getitimer\0"
902 "setitimer\0"
903 "timer_create\0"
904 "timer_delete\0"
905 "timer_getoverrun\0"
906 "timer_gettime\0"
6ca67710 907 "timer_gettime64\0"
cd0ddf6f 908 "timer_settime\0"
6ca67710 909 "timer_settime64\0"
cd0ddf6f
LP
910 "timerfd_create\0"
911 "timerfd_gettime\0"
6ca67710 912 "timerfd_gettime64\0"
cd0ddf6f 913 "timerfd_settime\0"
6ca67710 914 "timerfd_settime64\0"
cd0ddf6f
LP
915 "times\0"
916 },
95aac012
ZJS
917 [SYSCALL_FILTER_SET_KNOWN] = {
918 .name = "@known",
919 .help = "All known syscalls declared in the kernel",
920 .value =
921#include "syscall-list.h"
922 },
201c1cc2 923};
8130926d
LP
924
925const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
926 if (isempty(name) || name[0] != '@')
927 return NULL;
928
077e8fc0 929 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
930 if (streq(syscall_filter_sets[i].name, name))
931 return syscall_filter_sets + i;
932
933 return NULL;
934}
935
000c0520
ZJS
936static int add_syscall_filter_set(
937 scmp_filter_ctx seccomp,
938 const SyscallFilterSet *set,
939 uint32_t action,
940 char **exclude,
941 bool log_missing,
942 char ***added);
943
944int seccomp_add_syscall_filter_item(
945 scmp_filter_ctx *seccomp,
946 const char *name,
947 uint32_t action,
948 char **exclude,
949 bool log_missing,
950 char ***added) {
69b1b241
LP
951
952 assert(seccomp);
953 assert(name);
954
960e4569
LP
955 if (strv_contains(exclude, name))
956 return 0;
957
000c0520
ZJS
958 /* Any syscalls that are handled are added to the *added strv. The pointer
959 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
960
69b1b241
LP
961 if (name[0] == '@') {
962 const SyscallFilterSet *other;
963
964 other = syscall_filter_set_find(name);
baaa35ad
ZJS
965 if (!other)
966 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
967 "Filter set %s is not known!",
968 name);
69b1b241 969
000c0520 970 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 971
69b1b241 972 } else {
b54f36c6 973 int id, r;
69b1b241
LP
974
975 id = seccomp_syscall_resolve_name(name);
cff7bff8 976 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
977 if (log_missing)
978 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 979 return 0;
cff7bff8 980 }
69b1b241
LP
981
982 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 983 if (r < 0) {
69b1b241 984 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
985 bool ignore = r == -EDOM;
986
987 if (!ignore || log_missing)
988 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
989 name, id, ignore ? ", ignoring" : "");
990 if (!ignore)
991 return r;
b54f36c6 992 }
69b1b241 993
000c0520
ZJS
994 if (added) {
995 r = strv_extend(added, name);
996 if (r < 0)
997 return r;
998 }
999
b54f36c6
ZJS
1000 return 0;
1001 }
69b1b241
LP
1002}
1003
000c0520 1004static int add_syscall_filter_set(
469830d1 1005 scmp_filter_ctx seccomp,
469830d1 1006 const SyscallFilterSet *set,
960e4569 1007 uint32_t action,
b54f36c6 1008 char **exclude,
000c0520
ZJS
1009 bool log_missing,
1010 char ***added) {
469830d1 1011
8130926d
LP
1012 const char *sys;
1013 int r;
1014
000c0520
ZJS
1015 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1016
8130926d
LP
1017 assert(seccomp);
1018 assert(set);
1019
1020 NULSTR_FOREACH(sys, set->value) {
000c0520 1021 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1022 if (r < 0)
1023 return r;
469830d1
LP
1024 }
1025
1026 return 0;
1027}
1028
b54f36c6 1029int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
1030 uint32_t arch;
1031 int r;
1032
1033 assert(set);
1034
1035 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1036 * each local arch. */
469830d1
LP
1037
1038 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1039 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1040
1041 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1042
1043 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
1044 if (r < 0)
1045 return r;
469830d1 1046
000c0520 1047 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
7e86bd73
ZJS
1048 if (r < 0)
1049 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
1050
1051 r = seccomp_load(seccomp);
7bc5e0b1 1052 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1053 return r;
1054 if (r < 0)
1055 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
1056 }
1057
1058 return 0;
1059}
a3be2849 1060
b54f36c6 1061int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 1062 uint32_t arch;
a3be2849
LP
1063 int r;
1064
469830d1
LP
1065 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1066 * SyscallFilterSet* table. */
a3be2849 1067
8cfa775f 1068 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 1069 return 0;
a3be2849 1070
469830d1
LP
1071 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1072 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1073 void *syscall_id, *val;
a3be2849 1074
469830d1 1075 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1076
469830d1
LP
1077 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1078 if (r < 0)
1079 return r;
a3be2849 1080
90e74a66 1081 HASHMAP_FOREACH_KEY(val, syscall_id, set) {
8cfa775f 1082 uint32_t a = action;
b54f36c6
ZJS
1083 int id = PTR_TO_INT(syscall_id) - 1;
1084 int error = PTR_TO_INT(val);
8cfa775f 1085
005bfaf1
TM
1086 if (error == SECCOMP_ERROR_NUMBER_KILL)
1087 a = scmp_act_kill_process();
9df2cdd8
TM
1088#ifdef SCMP_ACT_LOG
1089 else if (action == SCMP_ACT_LOG)
1090 a = SCMP_ACT_LOG;
1091#endif
005bfaf1 1092 else if (action != SCMP_ACT_ALLOW && error >= 0)
b54f36c6 1093 a = SCMP_ACT_ERRNO(error);
8cfa775f 1094
b54f36c6 1095 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
1096 if (r < 0) {
1097 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1098 _cleanup_free_ char *n = NULL;
7e86bd73 1099 bool ignore;
469830d1 1100
b54f36c6 1101 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1102 ignore = r == -EDOM;
1103 if (!ignore || log_missing)
1104 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1105 strna(n), id, ignore ? ", ignoring" : "");
1106 if (!ignore)
1107 return r;
469830d1
LP
1108 }
1109 }
1110
1111 r = seccomp_load(seccomp);
7bc5e0b1 1112 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1113 return r;
1114 if (r < 0)
1115 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1116 }
1117
1118 return 0;
add00535
LP
1119}
1120
58f6ab44 1121int seccomp_parse_syscall_filter(
898748d8
YW
1122 const char *name,
1123 int errno_num,
1124 Hashmap *filter,
13d92c63 1125 SeccompParseFlags flags,
898748d8
YW
1126 const char *unit,
1127 const char *filename,
1128 unsigned line) {
1129
1130 int r;
1131
1132 assert(name);
1133 assert(filter);
1134
1135 if (name[0] == '@') {
1136 const SyscallFilterSet *set;
1137 const char *i;
1138
1139 set = syscall_filter_set_find(name);
1140 if (!set) {
13d92c63 1141 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1142 return -EINVAL;
13d92c63
LP
1143
1144 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1145 "Unknown system call group, ignoring: %s", name);
1146 return 0;
898748d8
YW
1147 }
1148
1149 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1150 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1151 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1152 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1153 * about them. */
58f6ab44 1154 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1155 if (r < 0)
1156 return r;
1157 }
1158 } else {
1159 int id;
1160
1161 id = seccomp_syscall_resolve_name(name);
1162 if (id == __NR_SCMP_ERROR) {
13d92c63 1163 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1164 return -EINVAL;
13d92c63
LP
1165
1166 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1167 "Failed to parse system call, ignoring: %s", name);
1168 return 0;
898748d8
YW
1169 }
1170
1171 /* If we previously wanted to forbid a syscall and now
1172 * we want to allow it, then remove it from the list. */
6b000af4 1173 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
898748d8
YW
1174 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1175 if (r < 0)
851ee70a
LW
1176 switch (r) {
1177 case -ENOMEM:
1178 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1179 case -EEXIST:
9d7fe7c6
LW
1180 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1181 break;
851ee70a
LW
1182 default:
1183 return r;
1184 }
898748d8
YW
1185 } else
1186 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1187 }
1188
1189 return 0;
1190}
1191
add00535 1192int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1193 uint32_t arch;
add00535
LP
1194 int r;
1195
f1d34068 1196 if (DEBUG_LOGGING) {
add00535
LP
1197 _cleanup_free_ char *s = NULL;
1198
86c2a9f1 1199 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1200 log_debug("Restricting namespace to: %s.", strna(s));
1201 }
1202
1203 /* NOOP? */
d7a0f1f4 1204 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
add00535
LP
1205 return 0;
1206
469830d1
LP
1207 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1208 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1209
469830d1
LP
1210 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1211
1212 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1213 if (r < 0)
1214 return r;
1215
1216 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1217 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1218 * altogether. */
1219 r = seccomp_rule_add_exact(
1220 seccomp,
1221 SCMP_ACT_ERRNO(EPERM),
1222 SCMP_SYS(setns),
1223 0);
1224 else
1225 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1226 * special invocation with a zero flags argument, right here. */
1227 r = seccomp_rule_add_exact(
1228 seccomp,
1229 SCMP_ACT_ERRNO(EPERM),
1230 SCMP_SYS(setns),
1231 1,
1232 SCMP_A1(SCMP_CMP_EQ, 0));
1233 if (r < 0) {
1234 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1235 continue;
1236 }
1237
077e8fc0 1238 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
469830d1
LP
1239 unsigned long f;
1240
1241 f = namespace_flag_map[i].flag;
d7a0f1f4 1242 if (FLAGS_SET(retain, f)) {
469830d1
LP
1243 log_debug("Permitting %s.", namespace_flag_map[i].name);
1244 continue;
1245 }
1246
1247 log_debug("Blocking %s.", namespace_flag_map[i].name);
1248
1249 r = seccomp_rule_add_exact(
1250 seccomp,
1251 SCMP_ACT_ERRNO(EPERM),
1252 SCMP_SYS(unshare),
1253 1,
1254 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1255 if (r < 0) {
1256 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1257 break;
1258 }
1259
511ceb1f
ZJS
1260 /* On s390/s390x the first two parameters to clone are switched */
1261 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1262 r = seccomp_rule_add_exact(
1263 seccomp,
1264 SCMP_ACT_ERRNO(EPERM),
1265 SCMP_SYS(clone),
1266 1,
1267 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1268 else
1269 r = seccomp_rule_add_exact(
1270 seccomp,
1271 SCMP_ACT_ERRNO(EPERM),
1272 SCMP_SYS(clone),
1273 1,
1274 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1275 if (r < 0) {
1276 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1277 break;
1278 }
1279
1280 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1281 r = seccomp_rule_add_exact(
1282 seccomp,
1283 SCMP_ACT_ERRNO(EPERM),
1284 SCMP_SYS(setns),
1285 1,
1286 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1287 if (r < 0) {
1288 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1289 break;
1290 }
1291 }
1292 }
1293 if (r < 0)
1294 continue;
1295
1296 r = seccomp_load(seccomp);
7bc5e0b1 1297 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1298 return r;
1299 if (r < 0)
1300 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1301 }
1302
1303 return 0;
1304}
1305
1306int seccomp_protect_sysctl(void) {
1307 uint32_t arch;
1308 int r;
1309
1310 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1311 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1312
1313 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1314
f9252236
AJ
1315 if (IN_SET(arch,
1316 SCMP_ARCH_AARCH64,
1317#ifdef SCMP_ARCH_RISCV64
1318 SCMP_ARCH_RISCV64,
1319#endif
1320 SCMP_ARCH_X32
1321 ))
2e64e8f4
ZJS
1322 /* No _sysctl syscall */
1323 continue;
1324
469830d1
LP
1325 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1326 if (r < 0)
1327 return r;
1328
1329 r = seccomp_rule_add_exact(
add00535
LP
1330 seccomp,
1331 SCMP_ACT_ERRNO(EPERM),
469830d1 1332 SCMP_SYS(_sysctl),
add00535 1333 0);
469830d1
LP
1334 if (r < 0) {
1335 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1336 continue;
1337 }
1338
1339 r = seccomp_load(seccomp);
7bc5e0b1 1340 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1341 return r;
1342 if (r < 0)
1343 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1344 }
1345
1346 return 0;
1347}
1348
620dbdd2
KK
1349int seccomp_protect_syslog(void) {
1350 uint32_t arch;
1351 int r;
1352
1353 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1354 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1355
1356 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1357 if (r < 0)
1358 return r;
1359
1360 r = seccomp_rule_add_exact(
1361 seccomp,
1362 SCMP_ACT_ERRNO(EPERM),
1363 SCMP_SYS(syslog),
1364 0);
1365
1366 if (r < 0) {
1367 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1368 continue;
1369 }
1370
1371 r = seccomp_load(seccomp);
1372 if (ERRNO_IS_SECCOMP_FATAL(r))
1373 return r;
1374 if (r < 0)
1375 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1376 }
1377
1378 return 0;
1379}
1380
6b000af4 1381int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1382 uint32_t arch;
1383 int r;
1384
1385 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1386 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1387 bool supported;
469830d1
LP
1388
1389 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1390
9606bc4b
LP
1391 switch (arch) {
1392
1393 case SCMP_ARCH_X86_64:
1394 case SCMP_ARCH_X32:
1395 case SCMP_ARCH_ARM:
1396 case SCMP_ARCH_AARCH64:
f5aeac14
JC
1397 case SCMP_ARCH_MIPSEL64N32:
1398 case SCMP_ARCH_MIPS64N32:
1399 case SCMP_ARCH_MIPSEL64:
1400 case SCMP_ARCH_MIPS64:
f9252236
AJ
1401#ifdef SCMP_ARCH_RISCV64
1402 case SCMP_ARCH_RISCV64:
1403#endif
9606bc4b
LP
1404 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1405 supported = true;
1406 break;
1407
9606bc4b
LP
1408 case SCMP_ARCH_S390:
1409 case SCMP_ARCH_S390X:
da1921a5 1410 case SCMP_ARCH_X86:
f5aeac14
JC
1411 case SCMP_ARCH_MIPSEL:
1412 case SCMP_ARCH_MIPS:
d5923e38
ZJS
1413 case SCMP_ARCH_PPC:
1414 case SCMP_ARCH_PPC64:
1415 case SCMP_ARCH_PPC64LE:
9606bc4b
LP
1416 default:
1417 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1418 * don't know */
1419 supported = false;
1420 break;
1421 }
1422
1423 if (!supported)
1424 continue;
1425
469830d1
LP
1426 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1427 if (r < 0)
1428 return r;
1429
6b000af4 1430 if (allow_list) {
077e8fc0 1431 int first = 0, last = 0;
469830d1
LP
1432 void *afp;
1433
6b000af4
LP
1434 /* If this is an allow list, we first block the address families that are out of
1435 * range and then everything that is not in the set. First, we find the lowest and
1436 * highest address family in the set. */
469830d1 1437
90e74a66 1438 SET_FOREACH(afp, address_families) {
077e8fc0 1439 int af = PTR_TO_INT(afp);
469830d1
LP
1440
1441 if (af <= 0 || af >= af_max())
1442 continue;
1443
1444 if (first == 0 || af < first)
1445 first = af;
1446
1447 if (last == 0 || af > last)
1448 last = af;
1449 }
1450
1451 assert((first == 0) == (last == 0));
1452
1453 if (first == 0) {
1454
1455 /* No entries in the valid range, block everything */
1456 r = seccomp_rule_add_exact(
1457 seccomp,
1458 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1459 SCMP_SYS(socket),
1460 0);
1461 if (r < 0) {
1462 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1463 continue;
1464 }
1465
1466 } else {
1467
1468 /* Block everything below the first entry */
1469 r = seccomp_rule_add_exact(
1470 seccomp,
1471 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1472 SCMP_SYS(socket),
1473 1,
1474 SCMP_A0(SCMP_CMP_LT, first));
1475 if (r < 0) {
1476 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 continue;
1478 }
1479
1480 /* Block everything above the last entry */
1481 r = seccomp_rule_add_exact(
1482 seccomp,
1483 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1484 SCMP_SYS(socket),
1485 1,
1486 SCMP_A0(SCMP_CMP_GT, last));
1487 if (r < 0) {
1488 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1489 continue;
1490 }
1491
1492 /* Block everything between the first and last entry */
077e8fc0 1493 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1494
1495 if (set_contains(address_families, INT_TO_PTR(af)))
1496 continue;
1497
1498 r = seccomp_rule_add_exact(
1499 seccomp,
1500 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1501 SCMP_SYS(socket),
1502 1,
1503 SCMP_A0(SCMP_CMP_EQ, af));
1504 if (r < 0)
1505 break;
1506 }
469830d1
LP
1507 if (r < 0) {
1508 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1509 continue;
1510 }
1511 }
1512
1513 } else {
1514 void *af;
1515
6b000af4
LP
1516 /* If this is a deny list, then generate one rule for each address family that are
1517 * then combined in OR checks. */
469830d1 1518
90e74a66 1519 SET_FOREACH(af, address_families) {
469830d1
LP
1520 r = seccomp_rule_add_exact(
1521 seccomp,
1522 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1523 SCMP_SYS(socket),
1524 1,
1525 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1526 if (r < 0)
1527 break;
1528 }
469830d1
LP
1529 if (r < 0) {
1530 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1531 continue;
1532 }
1533 }
1534
1535 r = seccomp_load(seccomp);
7bc5e0b1 1536 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1537 return r;
1538 if (r < 0)
1539 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1540 }
1541
1542 return 0;
1543}
1544
1545int seccomp_restrict_realtime(void) {
1546 static const int permitted_policies[] = {
1547 SCHED_OTHER,
1548 SCHED_BATCH,
1549 SCHED_IDLE,
1550 };
1551
1552 int r, max_policy = 0;
1553 uint32_t arch;
1554 unsigned i;
1555
1556 /* Determine the highest policy constant we want to allow */
1557 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1558 if (permitted_policies[i] > max_policy)
1559 max_policy = permitted_policies[i];
1560
1561 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1562 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1563 int p;
1564
1565 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1566
1567 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1568 if (r < 0)
1569 return r;
1570
1571 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1572 * allow list. */
469830d1
LP
1573 for (p = 0; p < max_policy; p++) {
1574 bool good = false;
1575
6b000af4 1576 /* Check if this is in the allow list. */
469830d1
LP
1577 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1578 if (permitted_policies[i] == p) {
1579 good = true;
1580 break;
1581 }
1582
1583 if (good)
1584 continue;
1585
1586 /* Deny this policy */
1587 r = seccomp_rule_add_exact(
1588 seccomp,
1589 SCMP_ACT_ERRNO(EPERM),
1590 SCMP_SYS(sched_setscheduler),
1591 1,
1592 SCMP_A1(SCMP_CMP_EQ, p));
1593 if (r < 0) {
1594 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1595 continue;
1596 }
1597 }
1598
6b000af4
LP
1599 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1600 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1601 r = seccomp_rule_add_exact(
add00535
LP
1602 seccomp,
1603 SCMP_ACT_ERRNO(EPERM),
469830d1 1604 SCMP_SYS(sched_setscheduler),
add00535 1605 1,
469830d1
LP
1606 SCMP_A1(SCMP_CMP_GT, max_policy));
1607 if (r < 0) {
1608 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1609 continue;
1610 }
add00535 1611
469830d1 1612 r = seccomp_load(seccomp);
7bc5e0b1 1613 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1614 return r;
1615 if (r < 0)
1616 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1617 }
1618
1619 return 0;
1620}
1621
6dc66688
ZJS
1622static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1623 uint32_t arch,
1624 int nr,
14cb109d 1625 unsigned arg_cnt,
6dc66688
ZJS
1626 const struct scmp_arg_cmp arg) {
1627 int r;
1628
1629 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1630 if (r < 0) {
1631 _cleanup_free_ char *n = NULL;
1632
1633 n = seccomp_syscall_resolve_num_arch(arch, nr);
1634 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1635 strna(n),
1636 seccomp_arch_to_string(arch));
1637 }
1638
1639 return r;
1640}
1641
/* For known architectures, check that syscalls are indeed defined or not.
 *
 * The checks below are evaluated at compile time: on x86-64, arm, aarch64 and 64-bit riscv builds they
 * require SCMP_SYS() to yield a positive number for shmget(), shmat() and shmdt(). */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#endif
6dc66688 1648
469830d1
LP
1649int seccomp_memory_deny_write_execute(void) {
1650 uint32_t arch;
b069c2a3 1651 unsigned loaded = 0;
469830d1
LP
1652
1653 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1654 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1655 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1656
469830d1
LP
1657 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1658
8a50cf69
LP
1659 switch (arch) {
1660
bed4668d
CE
1661 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1662 * We ignore that here, which means there's still a way to get writable/executable
1663 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1664
8a50cf69 1665 case SCMP_ARCH_X86:
57311925 1666 case SCMP_ARCH_S390:
8a50cf69
LP
1667 filter_syscall = SCMP_SYS(mmap2);
1668 block_syscall = SCMP_SYS(mmap);
bed4668d 1669 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1670 break;
1671
63d00dfb 1672 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1673 case SCMP_ARCH_PPC64:
1674 case SCMP_ARCH_PPC64LE:
bed4668d 1675 case SCMP_ARCH_S390X:
2a8d6e63 1676 filter_syscall = SCMP_SYS(mmap);
bed4668d 1677 /* shmat multiplexed, see above */
8a50cf69
LP
1678 break;
1679
4278d1f5
ZJS
1680 case SCMP_ARCH_ARM:
1681 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1682 shmat_syscall = SCMP_SYS(shmat);
1683 break;
1684
8a50cf69
LP
1685 case SCMP_ARCH_X86_64:
1686 case SCMP_ARCH_X32:
79873bc8 1687 case SCMP_ARCH_AARCH64:
f9252236
AJ
1688#ifdef SCMP_ARCH_RISCV64
1689 case SCMP_ARCH_RISCV64:
1690#endif
1691 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
8a50cf69
LP
1692 shmat_syscall = SCMP_SYS(shmat);
1693 break;
1694
1695 /* Please add more definitions here, if you port systemd to other architectures! */
1696
f9252236 1697#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
8a50cf69
LP
1698#warning "Consider adding the right mmap() syscall definitions here!"
1699#endif
1700 }
1701
1702 /* Can't filter mmap() on this arch, then skip it */
1703 if (filter_syscall == 0)
1704 continue;
1705
469830d1
LP
1706 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1707 if (r < 0)
1708 return r;
1709
6dc66688
ZJS
1710 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1711 1,
1712 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1713 if (r < 0)
1714 continue;
8a50cf69
LP
1715
1716 if (block_syscall != 0) {
6dc66688
ZJS
1717 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1718 if (r < 0)
8a50cf69 1719 continue;
add00535 1720 }
a3be2849 1721
6dc66688
ZJS
1722 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1723 1,
b835eeb4
ZJS
1724 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1725 if (r < 0)
1726 continue;
1727
91691f1d 1728#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1729 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1730 1,
6dc66688
ZJS
1731 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1732 if (r < 0)
469830d1 1733 continue;
91691f1d 1734#endif
add00535 1735
67fb5f33 1736 if (shmat_syscall > 0) {
5ef3ed97 1737 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1738 1,
1739 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1740 if (r < 0)
8a50cf69 1741 continue;
469830d1
LP
1742 }
1743
1744 r = seccomp_load(seccomp);
7bc5e0b1 1745 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1746 return r;
add00535 1747 if (r < 0)
b069c2a3
ZJS
1748 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1749 seccomp_arch_to_string(arch));
903659e7 1750 loaded++;
469830d1 1751 }
add00535 1752
903659e7 1753 if (loaded == 0)
b069c2a3 1754 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1755
1756 return loaded;
469830d1
LP
1757}
1758
1759int seccomp_restrict_archs(Set *archs) {
1760 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1
LP
1761 void *id;
1762 int r;
1763
1764 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1765 * list.
1766 *
1767 * There are some qualifications. However the most important use is to stop processes from bypassing
1768 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1769 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1770
2428aaf8
AJ
1771 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1772 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1773 * to run a program with the restrictions applied. */
469830d1
LP
1774 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1775 if (!seccomp)
1776 return -ENOMEM;
1777
90e74a66 1778 SET_FOREACH(id, archs) {
469830d1 1779 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1780 if (r < 0 && r != -EEXIST)
1781 return r;
1782 }
1783
1784 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1785 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1786 * The important thing is that you can block the old 32-bit x86 syscalls.
1787 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1788
1789 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1790 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1791
1792 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1793 if (r < 0 && r != -EEXIST)
469830d1 1794 return r;
add00535
LP
1795 }
1796
469830d1
LP
1797 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1798 if (r < 0)
1799 return r;
add00535 1800
1c6af69b 1801 r = seccomp_load(seccomp);
7bc5e0b1 1802 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1803 return r;
1804 if (r < 0)
1805 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1806
1807 return 0;
a3be2849 1808}
b16bd535 1809
de7fef4b
ZJS
1810int parse_syscall_archs(char **l, Set **ret_archs) {
1811 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1812 char **s;
1813 int r;
1814
1815 assert(l);
de7fef4b 1816 assert(ret_archs);
b16bd535
YW
1817
1818 STRV_FOREACH(s, l) {
1819 uint32_t a;
1820
1821 r = seccomp_arch_from_string(*s, &a);
1822 if (r < 0)
1823 return -EINVAL;
1824
de7fef4b 1825 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1826 if (r < 0)
1827 return -ENOMEM;
1828 }
1829
de7fef4b 1830 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1831 return 0;
1832}
165a31c0 1833
8cfa775f 1834int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1835 const char *i;
1836 int r;
1837
1838 assert(set);
1839
1840 NULSTR_FOREACH(i, set->value) {
1841
1842 if (i[0] == '@') {
1843 const SyscallFilterSet *more;
1844
1845 more = syscall_filter_set_find(i);
1846 if (!more)
1847 return -ENXIO;
1848
165a31c0
LP
1849 r = seccomp_filter_set_add(filter, add, more);
1850 if (r < 0)
1851 return r;
1852 } else {
1853 int id;
1854
1855 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1856 if (id == __NR_SCMP_ERROR) {
1857 log_debug("Couldn't resolve system call, ignoring: %s", i);
1858 continue;
1859 }
165a31c0
LP
1860
1861 if (add) {
8cfa775f 1862 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1863 if (r < 0)
1864 return r;
1865 } else
8cfa775f 1866 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1867 }
1868 }
1869
1870 return 0;
1871}
78e864e5
TM
1872
1873int seccomp_lock_personality(unsigned long personality) {
72eafe71 1874 uint32_t arch;
78e864e5
TM
1875 int r;
1876
72eafe71
LP
1877 if (personality >= PERSONALITY_INVALID)
1878 return -EINVAL;
78e864e5 1879
72eafe71
LP
1880 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1881 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1882
72eafe71
LP
1883 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1884 if (r < 0)
1885 return r;
1886
1887 r = seccomp_rule_add_exact(
1888 seccomp,
1889 SCMP_ACT_ERRNO(EPERM),
1890 SCMP_SYS(personality),
1891 1,
1892 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1893 if (r < 0) {
1894 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1895 continue;
1896 }
72eafe71
LP
1897
1898 r = seccomp_load(seccomp);
7bc5e0b1 1899 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1900 return r;
1901 if (r < 0)
1902 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1903 }
1904
1905 return 0;
78e864e5 1906}
aecd5ac6
TM
1907
1908int seccomp_protect_hostname(void) {
1909 uint32_t arch;
1910 int r;
1911
1912 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1913 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1914
1915 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1916 if (r < 0)
1917 return r;
1918
1919 r = seccomp_rule_add_exact(
1920 seccomp,
1921 SCMP_ACT_ERRNO(EPERM),
1922 SCMP_SYS(sethostname),
1923 0);
9e6e543c
LP
1924 if (r < 0) {
1925 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1926 continue;
9e6e543c 1927 }
aecd5ac6
TM
1928
1929 r = seccomp_rule_add_exact(
1930 seccomp,
1931 SCMP_ACT_ERRNO(EPERM),
1932 SCMP_SYS(setdomainname),
1933 0);
9e6e543c
LP
1934 if (r < 0) {
1935 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1936 continue;
9e6e543c 1937 }
aecd5ac6
TM
1938
1939 r = seccomp_load(seccomp);
7bc5e0b1 1940 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1941 return r;
1942 if (r < 0)
1943 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1944 }
1945
1946 return 0;
1947}
3c27973b 1948
da4dc9a6
ZJS
1949static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1950 /* Checks the mode_t parameter of the following system calls:
1951 *
1952 * → chmod() + fchmod() + fchmodat()
1953 * → open() + creat() + openat()
1954 * → mkdir() + mkdirat()
1955 * → mknod() + mknodat()
1956 *
1957 * Returns error if *everything* failed, and 0 otherwise.
1958 */
6d95e7d9 1959 int r;
da4dc9a6
ZJS
1960 bool any = false;
1961
1962 r = seccomp_rule_add_exact(
1963 seccomp,
1964 SCMP_ACT_ERRNO(EPERM),
1965 SCMP_SYS(chmod),
1966 1,
1967 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1968 if (r < 0)
1969 log_debug_errno(r, "Failed to add filter for chmod: %m");
1970 else
1971 any = true;
1972
1973 r = seccomp_rule_add_exact(
1974 seccomp,
1975 SCMP_ACT_ERRNO(EPERM),
1976 SCMP_SYS(fchmod),
1977 1,
1978 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1979 if (r < 0)
1980 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1981 else
1982 any = true;
1983
1984 r = seccomp_rule_add_exact(
1985 seccomp,
1986 SCMP_ACT_ERRNO(EPERM),
1987 SCMP_SYS(fchmodat),
1988 1,
1989 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1990 if (r < 0)
1991 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1992 else
1993 any = true;
1994
1995 r = seccomp_rule_add_exact(
1996 seccomp,
1997 SCMP_ACT_ERRNO(EPERM),
1998 SCMP_SYS(mkdir),
1999 1,
2000 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2001 if (r < 0)
2002 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2003 else
2004 any = true;
2005
2006 r = seccomp_rule_add_exact(
2007 seccomp,
2008 SCMP_ACT_ERRNO(EPERM),
2009 SCMP_SYS(mkdirat),
2010 1,
2011 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2012 if (r < 0)
2013 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2014 else
2015 any = true;
2016
2017 r = seccomp_rule_add_exact(
2018 seccomp,
2019 SCMP_ACT_ERRNO(EPERM),
2020 SCMP_SYS(mknod),
2021 1,
2022 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2023 if (r < 0)
2024 log_debug_errno(r, "Failed to add filter for mknod: %m");
2025 else
2026 any = true;
2027
2028 r = seccomp_rule_add_exact(
2029 seccomp,
2030 SCMP_ACT_ERRNO(EPERM),
2031 SCMP_SYS(mknodat),
2032 1,
2033 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2034 if (r < 0)
2035 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2036 else
2037 any = true;
2038
2039#if SCMP_SYS(open) > 0
2040 r = seccomp_rule_add_exact(
2041 seccomp,
2042 SCMP_ACT_ERRNO(EPERM),
2043 SCMP_SYS(open),
2044 2,
2045 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2046 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2047 if (r < 0)
2048 log_debug_errno(r, "Failed to add filter for open: %m");
2049 else
2050 any = true;
2051#endif
2052
2053 r = seccomp_rule_add_exact(
2054 seccomp,
2055 SCMP_ACT_ERRNO(EPERM),
2056 SCMP_SYS(openat),
2057 2,
2058 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2059 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2060 if (r < 0)
2061 log_debug_errno(r, "Failed to add filter for openat: %m");
2062 else
2063 any = true;
2064
ecc04067
LP
2065#if defined(__SNR_openat2)
2066 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2067 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2068 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2069 * compatible with kernels that are not absolutely recent. */
2070 r = seccomp_rule_add_exact(
2071 seccomp,
2072 SCMP_ACT_ERRNO(EPERM),
2073 SCMP_SYS(openat2),
2074 0);
2075 if (r < 0)
2076 log_debug_errno(r, "Failed to add filter for openat2: %m");
2077 else
2078 any = true;
2079#endif
2080
da4dc9a6
ZJS
2081 r = seccomp_rule_add_exact(
2082 seccomp,
2083 SCMP_ACT_ERRNO(EPERM),
2084 SCMP_SYS(creat),
2085 1,
2086 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2087 if (r < 0)
2088 log_debug_errno(r, "Failed to add filter for creat: %m");
2089 else
2090 any = true;
2091
2092 return any ? 0 : r;
2093}
2094
3c27973b
LP
2095int seccomp_restrict_suid_sgid(void) {
2096 uint32_t arch;
da4dc9a6 2097 int r, k;
3c27973b
LP
2098
2099 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2100 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2101
2102 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2103 if (r < 0)
2104 return r;
2105
da4dc9a6
ZJS
2106 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2107 if (r < 0)
2108 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2109
da4dc9a6
ZJS
2110 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2111 if (k < 0)
2112 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2113
da4dc9a6 2114 if (r < 0 && k < 0)
3c27973b 2115 continue;
3c27973b
LP
2116
2117 r = seccomp_load(seccomp);
7bc5e0b1 2118 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2119 return r;
2120 if (r < 0)
2121 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2122 }
2123
2124 return 0;
2125}
915fb324
LP
2126
2127uint32_t scmp_act_kill_process(void) {
2128
2129 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2130 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2131 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2132 * for single-threaded apps does the right thing. */
2133
2134#ifdef SCMP_ACT_KILL_PROCESS
2135 if (seccomp_api_get() >= 3)
2136 return SCMP_ACT_KILL_PROCESS;
2137#endif
2138
2139 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2140}