]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
seccomp-util: add cacheflush() syscall to @default syscall set
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
44aaddad 15#include "env-util.h"
d8b4d14d 16#include "errno-list.h"
a8fbdf54 17#include "macro.h"
add00535 18#include "nsflags.h"
d8b4d14d 19#include "nulstr-util.h"
78e864e5 20#include "process-util.h"
cf0fbc49 21#include "seccomp-util.h"
b16bd535 22#include "set.h"
07630cea 23#include "string-util.h"
b16bd535 24#include "strv.h"
469830d1
LP
25
26const uint32_t seccomp_local_archs[] = {
27
6b000af4 28 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
29
30#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
31 SCMP_ARCH_X86,
32 SCMP_ARCH_X86_64,
f2d9751c
LP
33 SCMP_ARCH_X32, /* native */
34#elif defined(__x86_64__) && !defined(__ILP32__)
35 SCMP_ARCH_X86,
469830d1 36 SCMP_ARCH_X32,
f2d9751c
LP
37 SCMP_ARCH_X86_64, /* native */
38#elif defined(__i386__)
39 SCMP_ARCH_X86,
40#elif defined(__aarch64__)
469830d1 41 SCMP_ARCH_ARM,
f2d9751c
LP
42 SCMP_ARCH_AARCH64, /* native */
43#elif defined(__arm__)
44 SCMP_ARCH_ARM,
45#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPSEL,
47 SCMP_ARCH_MIPS, /* native */
48#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 49 SCMP_ARCH_MIPS,
f2d9751c
LP
50 SCMP_ARCH_MIPSEL, /* native */
51#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
52 SCMP_ARCH_MIPSEL,
53 SCMP_ARCH_MIPS,
54 SCMP_ARCH_MIPSEL64N32,
469830d1 55 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
56 SCMP_ARCH_MIPSEL64,
57 SCMP_ARCH_MIPS64, /* native */
58#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPS,
469830d1 60 SCMP_ARCH_MIPSEL,
f2d9751c
LP
61 SCMP_ARCH_MIPS64N32,
62 SCMP_ARCH_MIPSEL64N32,
63 SCMP_ARCH_MIPS64,
64 SCMP_ARCH_MIPSEL64, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
469830d1 68 SCMP_ARCH_MIPSEL64,
f2d9751c 69 SCMP_ARCH_MIPS64,
469830d1 70 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
71 SCMP_ARCH_MIPS64N32, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64,
76 SCMP_ARCH_MIPSEL64,
77 SCMP_ARCH_MIPS64N32,
78 SCMP_ARCH_MIPSEL64N32, /* native */
79#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 80 SCMP_ARCH_PPC,
469830d1 81 SCMP_ARCH_PPC64LE,
f2d9751c
LP
82 SCMP_ARCH_PPC64, /* native */
83#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
84 SCMP_ARCH_PPC,
85 SCMP_ARCH_PPC64,
86 SCMP_ARCH_PPC64LE, /* native */
87#elif defined(__powerpc__)
88 SCMP_ARCH_PPC,
f9252236
AJ
89#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
90 SCMP_ARCH_RISCV64,
f2d9751c
LP
91#elif defined(__s390x__)
92 SCMP_ARCH_S390,
93 SCMP_ARCH_S390X, /* native */
94#elif defined(__s390__)
469830d1 95 SCMP_ARCH_S390,
469830d1
LP
96#endif
97 (uint32_t) -1
98 };
57183d11
LP
99
100const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
101 /* Maintain order used in <seccomp.h>.
102 *
103 * Names used here should be the same as those used for ConditionArchitecture=,
104 * except for "subarchitectures" like x32. */
57183d11 105
aa34055f
ZJS
106 switch(c) {
107 case SCMP_ARCH_NATIVE:
57183d11 108 return "native";
aa34055f 109 case SCMP_ARCH_X86:
57183d11 110 return "x86";
aa34055f 111 case SCMP_ARCH_X86_64:
57183d11 112 return "x86-64";
aa34055f 113 case SCMP_ARCH_X32:
57183d11 114 return "x32";
aa34055f 115 case SCMP_ARCH_ARM:
57183d11 116 return "arm";
aa34055f
ZJS
117 case SCMP_ARCH_AARCH64:
118 return "arm64";
119 case SCMP_ARCH_MIPS:
120 return "mips";
121 case SCMP_ARCH_MIPS64:
122 return "mips64";
123 case SCMP_ARCH_MIPS64N32:
124 return "mips64-n32";
125 case SCMP_ARCH_MIPSEL:
126 return "mips-le";
127 case SCMP_ARCH_MIPSEL64:
128 return "mips64-le";
129 case SCMP_ARCH_MIPSEL64N32:
130 return "mips64-le-n32";
131 case SCMP_ARCH_PPC:
132 return "ppc";
133 case SCMP_ARCH_PPC64:
134 return "ppc64";
135 case SCMP_ARCH_PPC64LE:
136 return "ppc64-le";
f9252236
AJ
137#ifdef SCMP_ARCH_RISCV64
138 case SCMP_ARCH_RISCV64:
139 return "riscv64";
140#endif
aa34055f 141 case SCMP_ARCH_S390:
6abfd303 142 return "s390";
aa34055f 143 case SCMP_ARCH_S390X:
6abfd303 144 return "s390x";
aa34055f
ZJS
145 default:
146 return NULL;
147 }
57183d11
LP
148}
149
150int seccomp_arch_from_string(const char *n, uint32_t *ret) {
151 if (!n)
152 return -EINVAL;
153
154 assert(ret);
155
156 if (streq(n, "native"))
157 *ret = SCMP_ARCH_NATIVE;
158 else if (streq(n, "x86"))
159 *ret = SCMP_ARCH_X86;
160 else if (streq(n, "x86-64"))
161 *ret = SCMP_ARCH_X86_64;
162 else if (streq(n, "x32"))
163 *ret = SCMP_ARCH_X32;
164 else if (streq(n, "arm"))
165 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
166 else if (streq(n, "arm64"))
167 *ret = SCMP_ARCH_AARCH64;
168 else if (streq(n, "mips"))
169 *ret = SCMP_ARCH_MIPS;
170 else if (streq(n, "mips64"))
171 *ret = SCMP_ARCH_MIPS64;
172 else if (streq(n, "mips64-n32"))
173 *ret = SCMP_ARCH_MIPS64N32;
174 else if (streq(n, "mips-le"))
175 *ret = SCMP_ARCH_MIPSEL;
176 else if (streq(n, "mips64-le"))
177 *ret = SCMP_ARCH_MIPSEL64;
178 else if (streq(n, "mips64-le-n32"))
179 *ret = SCMP_ARCH_MIPSEL64N32;
180 else if (streq(n, "ppc"))
181 *ret = SCMP_ARCH_PPC;
182 else if (streq(n, "ppc64"))
183 *ret = SCMP_ARCH_PPC64;
184 else if (streq(n, "ppc64-le"))
185 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
186#ifdef SCMP_ARCH_RISCV64
187 else if (streq(n, "riscv64"))
188 *ret = SCMP_ARCH_RISCV64;
189#endif
6abfd303
HB
190 else if (streq(n, "s390"))
191 *ret = SCMP_ARCH_S390;
192 else if (streq(n, "s390x"))
193 *ret = SCMP_ARCH_S390X;
57183d11
LP
194 else
195 return -EINVAL;
196
197 return 0;
198}
e9642be2 199
469830d1 200int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 201 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
202 int r;
203
469830d1
LP
204 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
205 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
206
207 seccomp = seccomp_init(default_action);
208 if (!seccomp)
209 return -ENOMEM;
210
469830d1
LP
211 if (arch != SCMP_ARCH_NATIVE &&
212 arch != seccomp_arch_native()) {
213
1b52793d 214 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 215 if (r < 0)
b4eaa6cc 216 return r;
469830d1 217
1b52793d 218 r = seccomp_arch_add(seccomp, arch);
469830d1 219 if (r < 0)
b4eaa6cc 220 return r;
469830d1
LP
221
222 assert(seccomp_arch_exist(seccomp, arch) >= 0);
223 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
224 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
225 } else {
226 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
227 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
228 }
229
230 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 231 if (r < 0)
b4eaa6cc 232 return r;
8d7b0c8f
LP
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
235 if (r < 0)
b4eaa6cc 236 return r;
8d7b0c8f 237
44aaddad
SD
238#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
239 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
240 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
241 if (r < 0)
242 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
243 }
244#endif
245
b4eaa6cc 246 *ret = TAKE_PTR(seccomp);
8d7b0c8f 247 return 0;
8d7b0c8f
LP
248}
249
/* Returns true if querying the current seccomp mode via prctl(PR_GET_SECCOMP) succeeds, false if the call
 * fails (presumably because the kernel was built without seccomp support). */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        if (mode < 0)
                return false;

        return true;
}
253
/* Probes for seccomp filter mode by asking the kernel to install a NULL filter program. Returns true only
 * if that request fails with EFAULT; a successful call or any other error yields false. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns true if both the basic seccomp probe and the seccomp filter probe succeed. The result is computed
 * on the first call and served from a function-local static on every later call. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1; /* -1: not probed yet, 0: unavailable, 1: available */

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
269
8130926d 270const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 271 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 272 .name = "@default",
d5efc18b 273 .help = "System calls that are always permitted",
40eb6a80 274 .value =
8e24b1d2 275 "cacheflush\0"
40eb6a80 276 "clock_getres\0"
6ca67710 277 "clock_getres_time64\0"
40eb6a80 278 "clock_gettime\0"
6ca67710 279 "clock_gettime64\0"
40eb6a80 280 "clock_nanosleep\0"
6ca67710 281 "clock_nanosleep_time64\0"
40eb6a80
ZJS
282 "execve\0"
283 "exit\0"
284 "exit_group\0"
e41b0f42 285 "futex\0"
6ca67710 286 "futex_time64\0"
e41b0f42
LP
287 "get_robust_list\0"
288 "get_thread_area\0"
09d3020b
DH
289 "getegid\0"
290 "getegid32\0"
291 "geteuid\0"
292 "geteuid32\0"
293 "getgid\0"
294 "getgid32\0"
295 "getgroups\0"
296 "getgroups32\0"
297 "getpgid\0"
298 "getpgrp\0"
299 "getpid\0"
300 "getppid\0"
301 "getresgid\0"
302 "getresgid32\0"
303 "getresuid\0"
304 "getresuid32\0"
40eb6a80 305 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
306 "getsid\0"
307 "gettid\0"
40eb6a80 308 "gettimeofday\0"
09d3020b
DH
309 "getuid\0"
310 "getuid32\0"
e41b0f42 311 "membarrier\0"
40eb6a80
ZJS
312 "nanosleep\0"
313 "pause\0"
4c3a9176 314 "prlimit64\0"
e41b0f42 315 "restart_syscall\0"
6fee3be0 316 "rseq\0"
40eb6a80 317 "rt_sigreturn\0"
8f44de08 318 "sched_yield\0"
e41b0f42
LP
319 "set_robust_list\0"
320 "set_thread_area\0"
321 "set_tid_address\0"
ce5faeac 322 "set_tls\0"
40eb6a80
ZJS
323 "sigreturn\0"
324 "time\0"
4c3a9176 325 "ugetrlimit\0"
40eb6a80 326 },
44898c53
LP
327 [SYSCALL_FILTER_SET_AIO] = {
328 .name = "@aio",
329 .help = "Asynchronous IO",
330 .value =
331 "io_cancel\0"
332 "io_destroy\0"
333 "io_getevents\0"
a05cfe23 334 "io_pgetevents\0"
6ca67710 335 "io_pgetevents_time64\0"
44898c53
LP
336 "io_setup\0"
337 "io_submit\0"
9e486265
LP
338 "io_uring_enter\0"
339 "io_uring_register\0"
340 "io_uring_setup\0"
44898c53 341 },
133ddbbe 342 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 343 .name = "@basic-io",
d5efc18b 344 .help = "Basic IO",
133ddbbe 345 .value =
648a0ed0 346 "_llseek\0"
133ddbbe 347 "close\0"
648a0ed0 348 "dup\0"
133ddbbe
LP
349 "dup2\0"
350 "dup3\0"
133ddbbe
LP
351 "lseek\0"
352 "pread64\0"
353 "preadv\0"
44898c53 354 "preadv2\0"
133ddbbe
LP
355 "pwrite64\0"
356 "pwritev\0"
44898c53 357 "pwritev2\0"
133ddbbe
LP
358 "read\0"
359 "readv\0"
360 "write\0"
361 "writev\0"
362 },
44898c53
LP
363 [SYSCALL_FILTER_SET_CHOWN] = {
364 .name = "@chown",
365 .help = "Change ownership of files and directories",
366 .value =
367 "chown\0"
368 "chown32\0"
369 "fchown\0"
370 "fchown32\0"
371 "fchownat\0"
372 "lchown\0"
373 "lchown32\0"
374 },
8130926d 375 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 376 .name = "@clock",
d5efc18b 377 .help = "Change the system time",
201c1cc2
TM
378 .value =
379 "adjtimex\0"
1f9ac68b 380 "clock_adjtime\0"
6ca67710 381 "clock_adjtime64\0"
1f9ac68b 382 "clock_settime\0"
6ca67710 383 "clock_settime64\0"
201c1cc2 384 "settimeofday\0"
1f9ac68b 385 "stime\0"
8130926d
LP
386 },
387 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 388 .name = "@cpu-emulation",
d5efc18b 389 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
390 .value =
391 "modify_ldt\0"
392 "subpage_prot\0"
393 "switch_endian\0"
394 "vm86\0"
395 "vm86old\0"
8130926d
LP
396 },
397 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 398 .name = "@debug",
d5efc18b 399 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
400 .value =
401 "lookup_dcookie\0"
402 "perf_event_open\0"
8270e3d8 403 "pidfd_getfd\0"
1f9ac68b
LP
404 "ptrace\0"
405 "rtas\0"
6da432fd 406#if defined __s390__ || defined __s390x__
1f9ac68b 407 "s390_runtime_instr\0"
8130926d 408#endif
1f9ac68b 409 "sys_debug_setcontext\0"
8130926d 410 },
1a1b13c9
LP
411 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
412 .name = "@file-system",
413 .help = "File system operations",
414 .value =
415 "access\0"
416 "chdir\0"
417 "chmod\0"
418 "close\0"
419 "creat\0"
420 "faccessat\0"
bcf08acb 421 "faccessat2\0"
1a1b13c9
LP
422 "fallocate\0"
423 "fchdir\0"
424 "fchmod\0"
425 "fchmodat\0"
1a1b13c9 426 "fcntl\0"
ceaa6aa7 427 "fcntl64\0"
1a1b13c9
LP
428 "fgetxattr\0"
429 "flistxattr\0"
ceaa6aa7 430 "fremovexattr\0"
1a1b13c9 431 "fsetxattr\0"
1a1b13c9 432 "fstat\0"
ceaa6aa7 433 "fstat64\0"
1a1b13c9 434 "fstatat64\0"
1a1b13c9 435 "fstatfs\0"
ceaa6aa7 436 "fstatfs64\0"
1a1b13c9 437 "ftruncate\0"
ceaa6aa7 438 "ftruncate64\0"
1a1b13c9
LP
439 "futimesat\0"
440 "getcwd\0"
1a1b13c9 441 "getdents\0"
ceaa6aa7 442 "getdents64\0"
1a1b13c9
LP
443 "getxattr\0"
444 "inotify_add_watch\0"
ceaa6aa7 445 "inotify_init\0"
1a1b13c9
LP
446 "inotify_init1\0"
447 "inotify_rm_watch\0"
448 "lgetxattr\0"
449 "link\0"
450 "linkat\0"
451 "listxattr\0"
452 "llistxattr\0"
453 "lremovexattr\0"
454 "lsetxattr\0"
1a1b13c9 455 "lstat\0"
ceaa6aa7 456 "lstat64\0"
1a1b13c9
LP
457 "mkdir\0"
458 "mkdirat\0"
459 "mknod\0"
460 "mknodat\0"
1a1b13c9 461 "mmap\0"
ceaa6aa7 462 "mmap2\0"
7961116e 463 "munmap\0"
1a1b13c9 464 "newfstatat\0"
ceaa6aa7
LP
465 "oldfstat\0"
466 "oldlstat\0"
467 "oldstat\0"
1a1b13c9
LP
468 "open\0"
469 "openat\0"
8270e3d8 470 "openat2\0"
1a1b13c9
LP
471 "readlink\0"
472 "readlinkat\0"
473 "removexattr\0"
474 "rename\0"
1a1b13c9 475 "renameat\0"
ceaa6aa7 476 "renameat2\0"
1a1b13c9
LP
477 "rmdir\0"
478 "setxattr\0"
1a1b13c9 479 "stat\0"
ceaa6aa7 480 "stat64\0"
1a1b13c9 481 "statfs\0"
ceaa6aa7 482 "statfs64\0"
a4135a74 483 "statx\0"
1a1b13c9
LP
484 "symlink\0"
485 "symlinkat\0"
1a1b13c9 486 "truncate\0"
ceaa6aa7 487 "truncate64\0"
1a1b13c9
LP
488 "unlink\0"
489 "unlinkat\0"
ceaa6aa7 490 "utime\0"
1a1b13c9 491 "utimensat\0"
6ca67710 492 "utimensat_time64\0"
1a1b13c9
LP
493 "utimes\0"
494 },
8130926d 495 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 496 .name = "@io-event",
d5efc18b 497 .help = "Event loop system calls",
201c1cc2
TM
498 .value =
499 "_newselect\0"
201c1cc2 500 "epoll_create\0"
215728ff 501 "epoll_create1\0"
201c1cc2
TM
502 "epoll_ctl\0"
503 "epoll_ctl_old\0"
504 "epoll_pwait\0"
505 "epoll_wait\0"
506 "epoll_wait_old\0"
201c1cc2 507 "eventfd\0"
215728ff 508 "eventfd2\0"
201c1cc2
TM
509 "poll\0"
510 "ppoll\0"
6ca67710 511 "ppoll_time64\0"
201c1cc2 512 "pselect6\0"
6ca67710 513 "pselect6_time64\0"
201c1cc2 514 "select\0"
8130926d
LP
515 },
516 [SYSCALL_FILTER_SET_IPC] = {
8130926d 517 .name = "@ipc",
d5efc18b
ZJS
518 .help = "SysV IPC, POSIX Message Queues or other IPC",
519 .value =
520 "ipc\0"
cd5bfd7e 521 "memfd_create\0"
201c1cc2
TM
522 "mq_getsetattr\0"
523 "mq_notify\0"
524 "mq_open\0"
525 "mq_timedreceive\0"
6ca67710 526 "mq_timedreceive_time64\0"
201c1cc2 527 "mq_timedsend\0"
6ca67710 528 "mq_timedsend_time64\0"
201c1cc2
TM
529 "mq_unlink\0"
530 "msgctl\0"
531 "msgget\0"
532 "msgrcv\0"
533 "msgsnd\0"
cd5bfd7e 534 "pipe\0"
215728ff 535 "pipe2\0"
201c1cc2
TM
536 "process_vm_readv\0"
537 "process_vm_writev\0"
538 "semctl\0"
539 "semget\0"
540 "semop\0"
541 "semtimedop\0"
6ca67710 542 "semtimedop_time64\0"
201c1cc2
TM
543 "shmat\0"
544 "shmctl\0"
545 "shmdt\0"
546 "shmget\0"
8130926d
LP
547 },
548 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 549 .name = "@keyring",
d5efc18b 550 .help = "Kernel keyring access",
1f9ac68b
LP
551 .value =
552 "add_key\0"
553 "keyctl\0"
554 "request_key\0"
8130926d 555 },
cd0ddf6f
LP
556 [SYSCALL_FILTER_SET_MEMLOCK] = {
557 .name = "@memlock",
558 .help = "Memory locking control",
559 .value =
560 "mlock\0"
561 "mlock2\0"
562 "mlockall\0"
563 "munlock\0"
564 "munlockall\0"
565 },
8130926d 566 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 567 .name = "@module",
d5efc18b 568 .help = "Loading and unloading of kernel modules",
201c1cc2 569 .value =
201c1cc2
TM
570 "delete_module\0"
571 "finit_module\0"
572 "init_module\0"
8130926d
LP
573 },
574 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 575 .name = "@mount",
d5efc18b 576 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
577 .value =
578 "chroot\0"
9e486265
LP
579 "fsconfig\0"
580 "fsmount\0"
581 "fsopen\0"
582 "fspick\0"
201c1cc2 583 "mount\0"
9e486265
LP
584 "move_mount\0"
585 "open_tree\0"
201c1cc2 586 "pivot_root\0"
201c1cc2 587 "umount\0"
215728ff 588 "umount2\0"
8130926d
LP
589 },
590 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 591 .name = "@network-io",
d5efc18b 592 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 593 .value =
201c1cc2 594 "accept\0"
215728ff 595 "accept4\0"
201c1cc2
TM
596 "bind\0"
597 "connect\0"
598 "getpeername\0"
599 "getsockname\0"
600 "getsockopt\0"
601 "listen\0"
602 "recv\0"
603 "recvfrom\0"
604 "recvmmsg\0"
6ca67710 605 "recvmmsg_time64\0"
201c1cc2
TM
606 "recvmsg\0"
607 "send\0"
608 "sendmmsg\0"
609 "sendmsg\0"
610 "sendto\0"
611 "setsockopt\0"
612 "shutdown\0"
613 "socket\0"
614 "socketcall\0"
615 "socketpair\0"
8130926d
LP
616 },
617 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 618 /* some unknown even to libseccomp */
8130926d 619 .name = "@obsolete",
d5efc18b 620 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
621 .value =
622 "_sysctl\0"
623 "afs_syscall\0"
802fa07a 624 "bdflush\0"
201c1cc2 625 "break\0"
1f9ac68b 626 "create_module\0"
201c1cc2
TM
627 "ftime\0"
628 "get_kernel_syms\0"
201c1cc2
TM
629 "getpmsg\0"
630 "gtty\0"
7e0c3b8f 631 "idle\0"
201c1cc2 632 "lock\0"
201c1cc2 633 "mpx\0"
201c1cc2
TM
634 "prof\0"
635 "profil\0"
201c1cc2
TM
636 "putpmsg\0"
637 "query_module\0"
201c1cc2
TM
638 "security\0"
639 "sgetmask\0"
640 "ssetmask\0"
641 "stty\0"
1f9ac68b 642 "sysfs\0"
201c1cc2
TM
643 "tuxcall\0"
644 "ulimit\0"
645 "uselib\0"
1f9ac68b 646 "ustat\0"
201c1cc2 647 "vserver\0"
8130926d 648 },
9493b168
ZJS
649 [SYSCALL_FILTER_SET_PKEY] = {
650 .name = "@pkey",
651 .help = "System calls used for memory protection keys",
652 .value =
653 "pkey_alloc\0"
654 "pkey_free\0"
655 "pkey_mprotect\0"
656 },
8130926d 657 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 658 .name = "@privileged",
d5efc18b 659 .help = "All system calls which need super-user capabilities",
201c1cc2 660 .value =
44898c53 661 "@chown\0"
201c1cc2
TM
662 "@clock\0"
663 "@module\0"
664 "@raw-io\0"
af0f047b
LP
665 "@reboot\0"
666 "@swap\0"
215728ff 667 "_sysctl\0"
201c1cc2 668 "acct\0"
201c1cc2 669 "bpf\0"
1f9ac68b 670 "capset\0"
201c1cc2 671 "chroot\0"
a05cfe23 672 "fanotify_init\0"
9e486265 673 "fanotify_mark\0"
201c1cc2 674 "nfsservctl\0"
a05cfe23 675 "open_by_handle_at\0"
201c1cc2
TM
676 "pivot_root\0"
677 "quotactl\0"
201c1cc2 678 "setdomainname\0"
201c1cc2 679 "setfsuid\0"
215728ff 680 "setfsuid32\0"
201c1cc2 681 "setgroups\0"
215728ff 682 "setgroups32\0"
201c1cc2 683 "sethostname\0"
201c1cc2 684 "setresuid\0"
215728ff 685 "setresuid32\0"
201c1cc2 686 "setreuid\0"
215728ff 687 "setreuid32\0"
e05ee49b 688 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 689 "setuid32\0"
201c1cc2 690 "vhangup\0"
8130926d
LP
691 },
692 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 693 .name = "@process",
d5efc18b 694 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
695 .value =
696 "arch_prctl\0"
09d3020b 697 "capget\0" /* Able to query arbitrary processes */
201c1cc2 698 "clone\0"
9e486265 699 "clone3\0"
201c1cc2
TM
700 "execveat\0"
701 "fork\0"
b887d2eb 702 "getrusage\0"
201c1cc2 703 "kill\0"
9e486265 704 "pidfd_open\0"
46fcf95d 705 "pidfd_send_signal\0"
201c1cc2 706 "prctl\0"
b887d2eb
LP
707 "rt_sigqueueinfo\0"
708 "rt_tgsigqueueinfo\0"
201c1cc2 709 "setns\0"
a9518dc3 710 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 711 "tgkill\0"
b887d2eb 712 "times\0"
201c1cc2
TM
713 "tkill\0"
714 "unshare\0"
715 "vfork\0"
b887d2eb
LP
716 "wait4\0"
717 "waitid\0"
718 "waitpid\0"
8130926d
LP
719 },
720 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 721 .name = "@raw-io",
d5efc18b 722 .help = "Raw I/O port access",
201c1cc2
TM
723 .value =
724 "ioperm\0"
725 "iopl\0"
1f9ac68b 726 "pciconfig_iobase\0"
201c1cc2
TM
727 "pciconfig_read\0"
728 "pciconfig_write\0"
6da432fd 729#if defined __s390__ || defined __s390x__
201c1cc2
TM
730 "s390_pci_mmio_read\0"
731 "s390_pci_mmio_write\0"
8130926d
LP
732#endif
733 },
bd2ab3f4
LP
734 [SYSCALL_FILTER_SET_REBOOT] = {
735 .name = "@reboot",
736 .help = "Reboot and reboot preparation/kexec",
737 .value =
bd2ab3f4 738 "kexec_file_load\0"
e59608fa 739 "kexec_load\0"
bd2ab3f4
LP
740 "reboot\0"
741 },
133ddbbe 742 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 743 .name = "@resources",
58a8f68b 744 .help = "Alter resource settings",
133ddbbe 745 .value =
0963c053
LP
746 "ioprio_set\0"
747 "mbind\0"
748 "migrate_pages\0"
749 "move_pages\0"
750 "nice\0"
0963c053
LP
751 "sched_setaffinity\0"
752 "sched_setattr\0"
133ddbbe
LP
753 "sched_setparam\0"
754 "sched_setscheduler\0"
0963c053 755 "set_mempolicy\0"
133ddbbe
LP
756 "setpriority\0"
757 "setrlimit\0"
133ddbbe 758 },
6eaaeee9
LP
759 [SYSCALL_FILTER_SET_SETUID] = {
760 .name = "@setuid",
761 .help = "Operations for changing user/group credentials",
762 .value =
6eaaeee9 763 "setgid\0"
215728ff 764 "setgid32\0"
6eaaeee9 765 "setgroups\0"
215728ff 766 "setgroups32\0"
6eaaeee9 767 "setregid\0"
215728ff 768 "setregid32\0"
6eaaeee9 769 "setresgid\0"
215728ff 770 "setresgid32\0"
6eaaeee9 771 "setresuid\0"
215728ff 772 "setresuid32\0"
6eaaeee9 773 "setreuid\0"
215728ff 774 "setreuid32\0"
6eaaeee9 775 "setuid\0"
215728ff 776 "setuid32\0"
6eaaeee9 777 },
cd0ddf6f
LP
778 [SYSCALL_FILTER_SET_SIGNAL] = {
779 .name = "@signal",
780 .help = "Process signal handling",
781 .value =
782 "rt_sigaction\0"
783 "rt_sigpending\0"
784 "rt_sigprocmask\0"
785 "rt_sigsuspend\0"
786 "rt_sigtimedwait\0"
6ca67710 787 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
788 "sigaction\0"
789 "sigaltstack\0"
790 "signal\0"
791 "signalfd\0"
792 "signalfd4\0"
793 "sigpending\0"
794 "sigprocmask\0"
795 "sigsuspend\0"
796 },
bd2ab3f4
LP
797 [SYSCALL_FILTER_SET_SWAP] = {
798 .name = "@swap",
799 .help = "Enable/disable swap devices",
800 .value =
801 "swapoff\0"
802 "swapon\0"
803 },
44898c53
LP
804 [SYSCALL_FILTER_SET_SYNC] = {
805 .name = "@sync",
806 .help = "Synchronize files and memory to storage",
807 .value =
808 "fdatasync\0"
809 "fsync\0"
810 "msync\0"
811 "sync\0"
812 "sync_file_range\0"
a8fb09f5 813 "sync_file_range2\0"
44898c53
LP
814 "syncfs\0"
815 },
70526841
LP
816 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
817 .name = "@system-service",
818 .help = "General system service operations",
819 .value =
820 "@aio\0"
821 "@basic-io\0"
822 "@chown\0"
823 "@default\0"
824 "@file-system\0"
825 "@io-event\0"
826 "@ipc\0"
827 "@keyring\0"
828 "@memlock\0"
829 "@network-io\0"
830 "@process\0"
831 "@resources\0"
832 "@setuid\0"
833 "@signal\0"
834 "@sync\0"
835 "@timer\0"
836 "brk\0"
837 "capget\0"
838 "capset\0"
839 "copy_file_range\0"
840 "fadvise64\0"
841 "fadvise64_64\0"
842 "flock\0"
843 "get_mempolicy\0"
844 "getcpu\0"
845 "getpriority\0"
846 "getrandom\0"
847 "ioctl\0"
848 "ioprio_get\0"
849 "kcmp\0"
850 "madvise\0"
70526841
LP
851 "mprotect\0"
852 "mremap\0"
853 "name_to_handle_at\0"
854 "oldolduname\0"
855 "olduname\0"
856 "personality\0"
857 "readahead\0"
858 "readdir\0"
859 "remap_file_pages\0"
860 "sched_get_priority_max\0"
861 "sched_get_priority_min\0"
862 "sched_getaffinity\0"
863 "sched_getattr\0"
864 "sched_getparam\0"
865 "sched_getscheduler\0"
866 "sched_rr_get_interval\0"
6ca67710 867 "sched_rr_get_interval_time64\0"
70526841
LP
868 "sched_yield\0"
869 "sendfile\0"
870 "sendfile64\0"
871 "setfsgid\0"
872 "setfsgid32\0"
873 "setfsuid\0"
874 "setfsuid32\0"
875 "setpgid\0"
876 "setsid\0"
877 "splice\0"
878 "sysinfo\0"
879 "tee\0"
880 "umask\0"
881 "uname\0"
882 "userfaultfd\0"
883 "vmsplice\0"
884 },
cd0ddf6f
LP
885 [SYSCALL_FILTER_SET_TIMER] = {
886 .name = "@timer",
887 .help = "Schedule operations by time",
888 .value =
889 "alarm\0"
890 "getitimer\0"
891 "setitimer\0"
892 "timer_create\0"
893 "timer_delete\0"
894 "timer_getoverrun\0"
895 "timer_gettime\0"
6ca67710 896 "timer_gettime64\0"
cd0ddf6f 897 "timer_settime\0"
6ca67710 898 "timer_settime64\0"
cd0ddf6f
LP
899 "timerfd_create\0"
900 "timerfd_gettime\0"
6ca67710 901 "timerfd_gettime64\0"
cd0ddf6f 902 "timerfd_settime\0"
6ca67710 903 "timerfd_settime64\0"
cd0ddf6f
LP
904 "times\0"
905 },
95aac012
ZJS
906 [SYSCALL_FILTER_SET_KNOWN] = {
907 .name = "@known",
908 .help = "All known syscalls declared in the kernel",
909 .value =
910#include "syscall-list.h"
911 },
201c1cc2 912};
8130926d
LP
913
914const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
915 if (isempty(name) || name[0] != '@')
916 return NULL;
917
077e8fc0 918 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
919 if (streq(syscall_filter_sets[i].name, name))
920 return syscall_filter_sets + i;
921
922 return NULL;
923}
924
000c0520
ZJS
925static int add_syscall_filter_set(
926 scmp_filter_ctx seccomp,
927 const SyscallFilterSet *set,
928 uint32_t action,
929 char **exclude,
930 bool log_missing,
931 char ***added);
932
933int seccomp_add_syscall_filter_item(
934 scmp_filter_ctx *seccomp,
935 const char *name,
936 uint32_t action,
937 char **exclude,
938 bool log_missing,
939 char ***added) {
69b1b241
LP
940
941 assert(seccomp);
942 assert(name);
943
960e4569
LP
944 if (strv_contains(exclude, name))
945 return 0;
946
000c0520
ZJS
947 /* Any syscalls that are handled are added to the *added strv. The pointer
948 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
949
69b1b241
LP
950 if (name[0] == '@') {
951 const SyscallFilterSet *other;
952
953 other = syscall_filter_set_find(name);
baaa35ad
ZJS
954 if (!other)
955 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
956 "Filter set %s is not known!",
957 name);
69b1b241 958
000c0520 959 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 960
69b1b241 961 } else {
b54f36c6 962 int id, r;
69b1b241
LP
963
964 id = seccomp_syscall_resolve_name(name);
cff7bff8 965 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
966 if (log_missing)
967 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 968 return 0;
cff7bff8 969 }
69b1b241
LP
970
971 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 972 if (r < 0) {
69b1b241 973 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
974 bool ignore = r == -EDOM;
975
976 if (!ignore || log_missing)
977 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
978 name, id, ignore ? ", ignoring" : "");
979 if (!ignore)
980 return r;
b54f36c6 981 }
69b1b241 982
000c0520
ZJS
983 if (added) {
984 r = strv_extend(added, name);
985 if (r < 0)
986 return r;
987 }
988
b54f36c6
ZJS
989 return 0;
990 }
69b1b241
LP
991}
992
000c0520 993static int add_syscall_filter_set(
469830d1 994 scmp_filter_ctx seccomp,
469830d1 995 const SyscallFilterSet *set,
960e4569 996 uint32_t action,
b54f36c6 997 char **exclude,
000c0520
ZJS
998 bool log_missing,
999 char ***added) {
469830d1 1000
8130926d
LP
1001 const char *sys;
1002 int r;
1003
000c0520
ZJS
1004 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1005
8130926d
LP
1006 assert(seccomp);
1007 assert(set);
1008
1009 NULSTR_FOREACH(sys, set->value) {
000c0520 1010 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1011 if (r < 0)
1012 return r;
469830d1
LP
1013 }
1014
1015 return 0;
1016}
1017
b54f36c6 1018int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
1019 uint32_t arch;
1020 int r;
1021
1022 assert(set);
1023
1024 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1025 * each local arch. */
469830d1
LP
1026
1027 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1028 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1029
1030 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1031
1032 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
1033 if (r < 0)
1034 return r;
469830d1 1035
000c0520 1036 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
7e86bd73
ZJS
1037 if (r < 0)
1038 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
1039
1040 r = seccomp_load(seccomp);
7bc5e0b1 1041 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1042 return r;
1043 if (r < 0)
1044 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
1045 }
1046
1047 return 0;
1048}
a3be2849 1049
b54f36c6 1050int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 1051 uint32_t arch;
a3be2849
LP
1052 int r;
1053
469830d1
LP
1054 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1055 * SyscallFilterSet* table. */
a3be2849 1056
8cfa775f 1057 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 1058 return 0;
a3be2849 1059
469830d1
LP
1060 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1061 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1062 void *syscall_id, *val;
a3be2849 1063
469830d1 1064 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1065
469830d1
LP
1066 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1067 if (r < 0)
1068 return r;
a3be2849 1069
90e74a66 1070 HASHMAP_FOREACH_KEY(val, syscall_id, set) {
8cfa775f 1071 uint32_t a = action;
b54f36c6
ZJS
1072 int id = PTR_TO_INT(syscall_id) - 1;
1073 int error = PTR_TO_INT(val);
8cfa775f 1074
005bfaf1
TM
1075 if (error == SECCOMP_ERROR_NUMBER_KILL)
1076 a = scmp_act_kill_process();
9df2cdd8
TM
1077#ifdef SCMP_ACT_LOG
1078 else if (action == SCMP_ACT_LOG)
1079 a = SCMP_ACT_LOG;
1080#endif
005bfaf1 1081 else if (action != SCMP_ACT_ALLOW && error >= 0)
b54f36c6 1082 a = SCMP_ACT_ERRNO(error);
8cfa775f 1083
b54f36c6 1084 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
1085 if (r < 0) {
1086 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1087 _cleanup_free_ char *n = NULL;
7e86bd73 1088 bool ignore;
469830d1 1089
b54f36c6 1090 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1091 ignore = r == -EDOM;
1092 if (!ignore || log_missing)
1093 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1094 strna(n), id, ignore ? ", ignoring" : "");
1095 if (!ignore)
1096 return r;
469830d1
LP
1097 }
1098 }
1099
1100 r = seccomp_load(seccomp);
7bc5e0b1 1101 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1102 return r;
1103 if (r < 0)
1104 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1105 }
1106
1107 return 0;
add00535
LP
1108}
1109
58f6ab44 1110int seccomp_parse_syscall_filter(
898748d8
YW
1111 const char *name,
1112 int errno_num,
1113 Hashmap *filter,
13d92c63 1114 SeccompParseFlags flags,
898748d8
YW
1115 const char *unit,
1116 const char *filename,
1117 unsigned line) {
1118
1119 int r;
1120
1121 assert(name);
1122 assert(filter);
1123
1124 if (name[0] == '@') {
1125 const SyscallFilterSet *set;
1126 const char *i;
1127
1128 set = syscall_filter_set_find(name);
1129 if (!set) {
13d92c63 1130 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1131 return -EINVAL;
13d92c63
LP
1132
1133 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1134 "Unknown system call group, ignoring: %s", name);
1135 return 0;
898748d8
YW
1136 }
1137
1138 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1139 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1140 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1141 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1142 * about them. */
58f6ab44 1143 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1144 if (r < 0)
1145 return r;
1146 }
1147 } else {
1148 int id;
1149
1150 id = seccomp_syscall_resolve_name(name);
1151 if (id == __NR_SCMP_ERROR) {
13d92c63 1152 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1153 return -EINVAL;
13d92c63
LP
1154
1155 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1156 "Failed to parse system call, ignoring: %s", name);
1157 return 0;
898748d8
YW
1158 }
1159
1160 /* If we previously wanted to forbid a syscall and now
1161 * we want to allow it, then remove it from the list. */
6b000af4 1162 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
898748d8
YW
1163 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1164 if (r < 0)
851ee70a
LW
1165 switch (r) {
1166 case -ENOMEM:
1167 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1168 case -EEXIST:
9d7fe7c6
LW
1169 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1170 break;
851ee70a
LW
1171 default:
1172 return r;
1173 }
898748d8
YW
1174 } else
1175 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1176 }
1177
1178 return 0;
1179}
1180
add00535 1181int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1182 uint32_t arch;
add00535
LP
1183 int r;
1184
f1d34068 1185 if (DEBUG_LOGGING) {
add00535
LP
1186 _cleanup_free_ char *s = NULL;
1187
86c2a9f1 1188 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1189 log_debug("Restricting namespace to: %s.", strna(s));
1190 }
1191
1192 /* NOOP? */
1193 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1194 return 0;
1195
469830d1
LP
1196 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1197 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1198
469830d1
LP
1199 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1200
1201 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1202 if (r < 0)
1203 return r;
1204
1205 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1206 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1207 * altogether. */
1208 r = seccomp_rule_add_exact(
1209 seccomp,
1210 SCMP_ACT_ERRNO(EPERM),
1211 SCMP_SYS(setns),
1212 0);
1213 else
1214 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1215 * special invocation with a zero flags argument, right here. */
1216 r = seccomp_rule_add_exact(
1217 seccomp,
1218 SCMP_ACT_ERRNO(EPERM),
1219 SCMP_SYS(setns),
1220 1,
1221 SCMP_A1(SCMP_CMP_EQ, 0));
1222 if (r < 0) {
1223 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1224 continue;
1225 }
1226
077e8fc0 1227 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
469830d1
LP
1228 unsigned long f;
1229
1230 f = namespace_flag_map[i].flag;
1231 if ((retain & f) == f) {
1232 log_debug("Permitting %s.", namespace_flag_map[i].name);
1233 continue;
1234 }
1235
1236 log_debug("Blocking %s.", namespace_flag_map[i].name);
1237
1238 r = seccomp_rule_add_exact(
1239 seccomp,
1240 SCMP_ACT_ERRNO(EPERM),
1241 SCMP_SYS(unshare),
1242 1,
1243 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1244 if (r < 0) {
1245 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1246 break;
1247 }
1248
511ceb1f
ZJS
1249 /* On s390/s390x the first two parameters to clone are switched */
1250 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1251 r = seccomp_rule_add_exact(
1252 seccomp,
1253 SCMP_ACT_ERRNO(EPERM),
1254 SCMP_SYS(clone),
1255 1,
1256 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1257 else
1258 r = seccomp_rule_add_exact(
1259 seccomp,
1260 SCMP_ACT_ERRNO(EPERM),
1261 SCMP_SYS(clone),
1262 1,
1263 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1264 if (r < 0) {
1265 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1266 break;
1267 }
1268
1269 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1270 r = seccomp_rule_add_exact(
1271 seccomp,
1272 SCMP_ACT_ERRNO(EPERM),
1273 SCMP_SYS(setns),
1274 1,
1275 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1276 if (r < 0) {
1277 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1278 break;
1279 }
1280 }
1281 }
1282 if (r < 0)
1283 continue;
1284
1285 r = seccomp_load(seccomp);
7bc5e0b1 1286 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1287 return r;
1288 if (r < 0)
1289 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1290 }
1291
1292 return 0;
1293}
1294
1295int seccomp_protect_sysctl(void) {
1296 uint32_t arch;
1297 int r;
1298
1299 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1300 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1301
1302 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1303
f9252236
AJ
1304 if (IN_SET(arch,
1305 SCMP_ARCH_AARCH64,
1306#ifdef SCMP_ARCH_RISCV64
1307 SCMP_ARCH_RISCV64,
1308#endif
1309 SCMP_ARCH_X32
1310 ))
2e64e8f4
ZJS
1311 /* No _sysctl syscall */
1312 continue;
1313
469830d1
LP
1314 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1315 if (r < 0)
1316 return r;
1317
1318 r = seccomp_rule_add_exact(
add00535
LP
1319 seccomp,
1320 SCMP_ACT_ERRNO(EPERM),
469830d1 1321 SCMP_SYS(_sysctl),
add00535 1322 0);
469830d1
LP
1323 if (r < 0) {
1324 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1325 continue;
1326 }
1327
1328 r = seccomp_load(seccomp);
7bc5e0b1 1329 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1330 return r;
1331 if (r < 0)
1332 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333 }
1334
1335 return 0;
1336}
1337
620dbdd2
KK
1338int seccomp_protect_syslog(void) {
1339 uint32_t arch;
1340 int r;
1341
1342 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1343 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1344
1345 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1346 if (r < 0)
1347 return r;
1348
1349 r = seccomp_rule_add_exact(
1350 seccomp,
1351 SCMP_ACT_ERRNO(EPERM),
1352 SCMP_SYS(syslog),
1353 0);
1354
1355 if (r < 0) {
1356 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1357 continue;
1358 }
1359
1360 r = seccomp_load(seccomp);
1361 if (ERRNO_IS_SECCOMP_FATAL(r))
1362 return r;
1363 if (r < 0)
1364 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1365 }
1366
1367 return 0;
1368}
1369
6b000af4 1370int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1371 uint32_t arch;
1372 int r;
1373
1374 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1375 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1376 bool supported;
469830d1
LP
1377
1378 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1379
9606bc4b
LP
1380 switch (arch) {
1381
1382 case SCMP_ARCH_X86_64:
1383 case SCMP_ARCH_X32:
1384 case SCMP_ARCH_ARM:
1385 case SCMP_ARCH_AARCH64:
0d9fca76 1386 case SCMP_ARCH_PPC:
da1921a5
ZJS
1387 case SCMP_ARCH_PPC64:
1388 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1389 case SCMP_ARCH_MIPSEL64N32:
1390 case SCMP_ARCH_MIPS64N32:
1391 case SCMP_ARCH_MIPSEL64:
1392 case SCMP_ARCH_MIPS64:
f9252236
AJ
1393#ifdef SCMP_ARCH_RISCV64
1394 case SCMP_ARCH_RISCV64:
1395#endif
9606bc4b
LP
1396 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1397 supported = true;
1398 break;
1399
9606bc4b
LP
1400 case SCMP_ARCH_S390:
1401 case SCMP_ARCH_S390X:
da1921a5 1402 case SCMP_ARCH_X86:
f5aeac14
JC
1403 case SCMP_ARCH_MIPSEL:
1404 case SCMP_ARCH_MIPS:
9606bc4b
LP
1405 default:
1406 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1407 * don't know */
1408 supported = false;
1409 break;
1410 }
1411
1412 if (!supported)
1413 continue;
1414
469830d1
LP
1415 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1416 if (r < 0)
1417 return r;
1418
6b000af4 1419 if (allow_list) {
077e8fc0 1420 int first = 0, last = 0;
469830d1
LP
1421 void *afp;
1422
6b000af4
LP
1423 /* If this is an allow list, we first block the address families that are out of
1424 * range and then everything that is not in the set. First, we find the lowest and
1425 * highest address family in the set. */
469830d1 1426
90e74a66 1427 SET_FOREACH(afp, address_families) {
077e8fc0 1428 int af = PTR_TO_INT(afp);
469830d1
LP
1429
1430 if (af <= 0 || af >= af_max())
1431 continue;
1432
1433 if (first == 0 || af < first)
1434 first = af;
1435
1436 if (last == 0 || af > last)
1437 last = af;
1438 }
1439
1440 assert((first == 0) == (last == 0));
1441
1442 if (first == 0) {
1443
1444 /* No entries in the valid range, block everything */
1445 r = seccomp_rule_add_exact(
1446 seccomp,
1447 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1448 SCMP_SYS(socket),
1449 0);
1450 if (r < 0) {
1451 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1452 continue;
1453 }
1454
1455 } else {
1456
1457 /* Block everything below the first entry */
1458 r = seccomp_rule_add_exact(
1459 seccomp,
1460 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1461 SCMP_SYS(socket),
1462 1,
1463 SCMP_A0(SCMP_CMP_LT, first));
1464 if (r < 0) {
1465 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1466 continue;
1467 }
1468
1469 /* Block everything above the last entry */
1470 r = seccomp_rule_add_exact(
1471 seccomp,
1472 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1473 SCMP_SYS(socket),
1474 1,
1475 SCMP_A0(SCMP_CMP_GT, last));
1476 if (r < 0) {
1477 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1478 continue;
1479 }
1480
1481 /* Block everything between the first and last entry */
077e8fc0 1482 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1483
1484 if (set_contains(address_families, INT_TO_PTR(af)))
1485 continue;
1486
1487 r = seccomp_rule_add_exact(
1488 seccomp,
1489 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1490 SCMP_SYS(socket),
1491 1,
1492 SCMP_A0(SCMP_CMP_EQ, af));
1493 if (r < 0)
1494 break;
1495 }
469830d1
LP
1496 if (r < 0) {
1497 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1498 continue;
1499 }
1500 }
1501
1502 } else {
1503 void *af;
1504
6b000af4
LP
1505 /* If this is a deny list, then generate one rule for each address family that are
1506 * then combined in OR checks. */
469830d1 1507
90e74a66 1508 SET_FOREACH(af, address_families) {
469830d1
LP
1509 r = seccomp_rule_add_exact(
1510 seccomp,
1511 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1512 SCMP_SYS(socket),
1513 1,
1514 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1515 if (r < 0)
1516 break;
1517 }
469830d1
LP
1518 if (r < 0) {
1519 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1520 continue;
1521 }
1522 }
1523
1524 r = seccomp_load(seccomp);
7bc5e0b1 1525 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1526 return r;
1527 if (r < 0)
1528 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1529 }
1530
1531 return 0;
1532}
1533
1534int seccomp_restrict_realtime(void) {
1535 static const int permitted_policies[] = {
1536 SCHED_OTHER,
1537 SCHED_BATCH,
1538 SCHED_IDLE,
1539 };
1540
1541 int r, max_policy = 0;
1542 uint32_t arch;
1543 unsigned i;
1544
1545 /* Determine the highest policy constant we want to allow */
1546 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1547 if (permitted_policies[i] > max_policy)
1548 max_policy = permitted_policies[i];
1549
1550 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1551 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1552 int p;
1553
1554 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1555
1556 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1557 if (r < 0)
1558 return r;
1559
1560 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1561 * allow list. */
469830d1
LP
1562 for (p = 0; p < max_policy; p++) {
1563 bool good = false;
1564
6b000af4 1565 /* Check if this is in the allow list. */
469830d1
LP
1566 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1567 if (permitted_policies[i] == p) {
1568 good = true;
1569 break;
1570 }
1571
1572 if (good)
1573 continue;
1574
1575 /* Deny this policy */
1576 r = seccomp_rule_add_exact(
1577 seccomp,
1578 SCMP_ACT_ERRNO(EPERM),
1579 SCMP_SYS(sched_setscheduler),
1580 1,
1581 SCMP_A1(SCMP_CMP_EQ, p));
1582 if (r < 0) {
1583 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1584 continue;
1585 }
1586 }
1587
6b000af4
LP
1588 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1589 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1590 r = seccomp_rule_add_exact(
add00535
LP
1591 seccomp,
1592 SCMP_ACT_ERRNO(EPERM),
469830d1 1593 SCMP_SYS(sched_setscheduler),
add00535 1594 1,
469830d1
LP
1595 SCMP_A1(SCMP_CMP_GT, max_policy));
1596 if (r < 0) {
1597 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1598 continue;
1599 }
add00535 1600
469830d1 1601 r = seccomp_load(seccomp);
7bc5e0b1 1602 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1603 return r;
1604 if (r < 0)
1605 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1606 }
1607
1608 return 0;
1609}
1610
6dc66688
ZJS
1611static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1612 uint32_t arch,
1613 int nr,
14cb109d 1614 unsigned arg_cnt,
6dc66688
ZJS
1615 const struct scmp_arg_cmp arg) {
1616 int r;
1617
1618 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1619 if (r < 0) {
1620 _cleanup_free_ char *n = NULL;
1621
1622 n = seccomp_syscall_resolve_num_arch(arch, nr);
1623 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1624 strna(n),
1625 seccomp_arch_to_string(arch));
1626 }
1627
1628 return r;
1629}
1630
2a8d6e63 1631/* For known architectures, check that syscalls are indeed defined or not. */
f9252236 1632#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
2a8d6e63
ZJS
1633assert_cc(SCMP_SYS(shmget) > 0);
1634assert_cc(SCMP_SYS(shmat) > 0);
1635assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1636#endif
6dc66688 1637
469830d1
LP
1638int seccomp_memory_deny_write_execute(void) {
1639 uint32_t arch;
b069c2a3 1640 unsigned loaded = 0;
469830d1
LP
1641
1642 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1643 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1644 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1645
469830d1
LP
1646 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1647
8a50cf69
LP
1648 switch (arch) {
1649
bed4668d
CE
1650 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1651 * We ignore that here, which means there's still a way to get writable/executable
1652 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1653
8a50cf69 1654 case SCMP_ARCH_X86:
57311925 1655 case SCMP_ARCH_S390:
8a50cf69
LP
1656 filter_syscall = SCMP_SYS(mmap2);
1657 block_syscall = SCMP_SYS(mmap);
bed4668d 1658 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1659 break;
1660
63d00dfb 1661 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1662 case SCMP_ARCH_PPC64:
1663 case SCMP_ARCH_PPC64LE:
bed4668d 1664 case SCMP_ARCH_S390X:
2a8d6e63 1665 filter_syscall = SCMP_SYS(mmap);
bed4668d 1666 /* shmat multiplexed, see above */
8a50cf69
LP
1667 break;
1668
4278d1f5
ZJS
1669 case SCMP_ARCH_ARM:
1670 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1671 shmat_syscall = SCMP_SYS(shmat);
1672 break;
1673
8a50cf69
LP
1674 case SCMP_ARCH_X86_64:
1675 case SCMP_ARCH_X32:
79873bc8 1676 case SCMP_ARCH_AARCH64:
f9252236
AJ
1677#ifdef SCMP_ARCH_RISCV64
1678 case SCMP_ARCH_RISCV64:
1679#endif
1680 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
8a50cf69
LP
1681 shmat_syscall = SCMP_SYS(shmat);
1682 break;
1683
1684 /* Please add more definitions here, if you port systemd to other architectures! */
1685
f9252236 1686#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
8a50cf69
LP
1687#warning "Consider adding the right mmap() syscall definitions here!"
1688#endif
1689 }
1690
1691 /* Can't filter mmap() on this arch, then skip it */
1692 if (filter_syscall == 0)
1693 continue;
1694
469830d1
LP
1695 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1696 if (r < 0)
1697 return r;
1698
6dc66688
ZJS
1699 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1700 1,
1701 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1702 if (r < 0)
1703 continue;
8a50cf69
LP
1704
1705 if (block_syscall != 0) {
6dc66688
ZJS
1706 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1707 if (r < 0)
8a50cf69 1708 continue;
add00535 1709 }
a3be2849 1710
6dc66688
ZJS
1711 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1712 1,
b835eeb4
ZJS
1713 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1714 if (r < 0)
1715 continue;
1716
91691f1d 1717#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1718 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1719 1,
6dc66688
ZJS
1720 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1721 if (r < 0)
469830d1 1722 continue;
91691f1d 1723#endif
add00535 1724
67fb5f33 1725 if (shmat_syscall > 0) {
5ef3ed97 1726 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1727 1,
1728 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1729 if (r < 0)
8a50cf69 1730 continue;
469830d1
LP
1731 }
1732
1733 r = seccomp_load(seccomp);
7bc5e0b1 1734 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1735 return r;
add00535 1736 if (r < 0)
b069c2a3
ZJS
1737 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1738 seccomp_arch_to_string(arch));
903659e7 1739 loaded++;
469830d1 1740 }
add00535 1741
903659e7 1742 if (loaded == 0)
b069c2a3 1743 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1744
1745 return loaded;
469830d1
LP
1746}
1747
1748int seccomp_restrict_archs(Set *archs) {
1749 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1
LP
1750 void *id;
1751 int r;
1752
1753 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1754 * list.
1755 *
1756 * There are some qualifications. However the most important use is to stop processes from bypassing
1757 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1758 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1759
2428aaf8
AJ
1760 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1761 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1762 * to run a program with the restrictions applied. */
469830d1
LP
1763 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1764 if (!seccomp)
1765 return -ENOMEM;
1766
90e74a66 1767 SET_FOREACH(id, archs) {
469830d1 1768 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1769 if (r < 0 && r != -EEXIST)
1770 return r;
1771 }
1772
1773 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1774 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1775 * The important thing is that you can block the old 32-bit x86 syscalls.
1776 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1777
1778 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1779 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1780
1781 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1782 if (r < 0 && r != -EEXIST)
469830d1 1783 return r;
add00535
LP
1784 }
1785
469830d1
LP
1786 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1787 if (r < 0)
1788 return r;
add00535 1789
1c6af69b 1790 r = seccomp_load(seccomp);
7bc5e0b1 1791 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1792 return r;
1793 if (r < 0)
1794 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1795
1796 return 0;
a3be2849 1797}
b16bd535 1798
de7fef4b
ZJS
1799int parse_syscall_archs(char **l, Set **ret_archs) {
1800 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1801 char **s;
1802 int r;
1803
1804 assert(l);
de7fef4b 1805 assert(ret_archs);
b16bd535
YW
1806
1807 STRV_FOREACH(s, l) {
1808 uint32_t a;
1809
1810 r = seccomp_arch_from_string(*s, &a);
1811 if (r < 0)
1812 return -EINVAL;
1813
de7fef4b 1814 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1815 if (r < 0)
1816 return -ENOMEM;
1817 }
1818
de7fef4b 1819 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1820 return 0;
1821}
165a31c0 1822
8cfa775f 1823int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1824 const char *i;
1825 int r;
1826
1827 assert(set);
1828
1829 NULSTR_FOREACH(i, set->value) {
1830
1831 if (i[0] == '@') {
1832 const SyscallFilterSet *more;
1833
1834 more = syscall_filter_set_find(i);
1835 if (!more)
1836 return -ENXIO;
1837
165a31c0
LP
1838 r = seccomp_filter_set_add(filter, add, more);
1839 if (r < 0)
1840 return r;
1841 } else {
1842 int id;
1843
1844 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1845 if (id == __NR_SCMP_ERROR) {
1846 log_debug("Couldn't resolve system call, ignoring: %s", i);
1847 continue;
1848 }
165a31c0
LP
1849
1850 if (add) {
8cfa775f 1851 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1852 if (r < 0)
1853 return r;
1854 } else
8cfa775f 1855 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1856 }
1857 }
1858
1859 return 0;
1860}
78e864e5
TM
1861
1862int seccomp_lock_personality(unsigned long personality) {
72eafe71 1863 uint32_t arch;
78e864e5
TM
1864 int r;
1865
72eafe71
LP
1866 if (personality >= PERSONALITY_INVALID)
1867 return -EINVAL;
78e864e5 1868
72eafe71
LP
1869 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1870 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1871
72eafe71
LP
1872 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1873 if (r < 0)
1874 return r;
1875
1876 r = seccomp_rule_add_exact(
1877 seccomp,
1878 SCMP_ACT_ERRNO(EPERM),
1879 SCMP_SYS(personality),
1880 1,
1881 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1882 if (r < 0) {
1883 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1884 continue;
1885 }
72eafe71
LP
1886
1887 r = seccomp_load(seccomp);
7bc5e0b1 1888 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1889 return r;
1890 if (r < 0)
1891 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1892 }
1893
1894 return 0;
78e864e5 1895}
aecd5ac6
TM
1896
1897int seccomp_protect_hostname(void) {
1898 uint32_t arch;
1899 int r;
1900
1901 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1902 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1903
1904 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1905 if (r < 0)
1906 return r;
1907
1908 r = seccomp_rule_add_exact(
1909 seccomp,
1910 SCMP_ACT_ERRNO(EPERM),
1911 SCMP_SYS(sethostname),
1912 0);
9e6e543c
LP
1913 if (r < 0) {
1914 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1915 continue;
9e6e543c 1916 }
aecd5ac6
TM
1917
1918 r = seccomp_rule_add_exact(
1919 seccomp,
1920 SCMP_ACT_ERRNO(EPERM),
1921 SCMP_SYS(setdomainname),
1922 0);
9e6e543c
LP
1923 if (r < 0) {
1924 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1925 continue;
9e6e543c 1926 }
aecd5ac6
TM
1927
1928 r = seccomp_load(seccomp);
7bc5e0b1 1929 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1930 return r;
1931 if (r < 0)
1932 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1933 }
1934
1935 return 0;
1936}
3c27973b 1937
da4dc9a6
ZJS
1938static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1939 /* Checks the mode_t parameter of the following system calls:
1940 *
1941 * → chmod() + fchmod() + fchmodat()
1942 * → open() + creat() + openat()
1943 * → mkdir() + mkdirat()
1944 * → mknod() + mknodat()
1945 *
1946 * Returns error if *everything* failed, and 0 otherwise.
1947 */
6d95e7d9 1948 int r;
da4dc9a6
ZJS
1949 bool any = false;
1950
1951 r = seccomp_rule_add_exact(
1952 seccomp,
1953 SCMP_ACT_ERRNO(EPERM),
1954 SCMP_SYS(chmod),
1955 1,
1956 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1957 if (r < 0)
1958 log_debug_errno(r, "Failed to add filter for chmod: %m");
1959 else
1960 any = true;
1961
1962 r = seccomp_rule_add_exact(
1963 seccomp,
1964 SCMP_ACT_ERRNO(EPERM),
1965 SCMP_SYS(fchmod),
1966 1,
1967 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1968 if (r < 0)
1969 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1970 else
1971 any = true;
1972
1973 r = seccomp_rule_add_exact(
1974 seccomp,
1975 SCMP_ACT_ERRNO(EPERM),
1976 SCMP_SYS(fchmodat),
1977 1,
1978 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1979 if (r < 0)
1980 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1981 else
1982 any = true;
1983
1984 r = seccomp_rule_add_exact(
1985 seccomp,
1986 SCMP_ACT_ERRNO(EPERM),
1987 SCMP_SYS(mkdir),
1988 1,
1989 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1990 if (r < 0)
1991 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1992 else
1993 any = true;
1994
1995 r = seccomp_rule_add_exact(
1996 seccomp,
1997 SCMP_ACT_ERRNO(EPERM),
1998 SCMP_SYS(mkdirat),
1999 1,
2000 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2001 if (r < 0)
2002 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2003 else
2004 any = true;
2005
2006 r = seccomp_rule_add_exact(
2007 seccomp,
2008 SCMP_ACT_ERRNO(EPERM),
2009 SCMP_SYS(mknod),
2010 1,
2011 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2012 if (r < 0)
2013 log_debug_errno(r, "Failed to add filter for mknod: %m");
2014 else
2015 any = true;
2016
2017 r = seccomp_rule_add_exact(
2018 seccomp,
2019 SCMP_ACT_ERRNO(EPERM),
2020 SCMP_SYS(mknodat),
2021 1,
2022 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2023 if (r < 0)
2024 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2025 else
2026 any = true;
2027
2028#if SCMP_SYS(open) > 0
2029 r = seccomp_rule_add_exact(
2030 seccomp,
2031 SCMP_ACT_ERRNO(EPERM),
2032 SCMP_SYS(open),
2033 2,
2034 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2035 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2036 if (r < 0)
2037 log_debug_errno(r, "Failed to add filter for open: %m");
2038 else
2039 any = true;
2040#endif
2041
2042 r = seccomp_rule_add_exact(
2043 seccomp,
2044 SCMP_ACT_ERRNO(EPERM),
2045 SCMP_SYS(openat),
2046 2,
2047 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2048 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2049 if (r < 0)
2050 log_debug_errno(r, "Failed to add filter for openat: %m");
2051 else
2052 any = true;
2053
ecc04067
LP
2054#if defined(__SNR_openat2)
2055 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2056 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2057 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2058 * compatible with kernels that are not absolutely recent. */
2059 r = seccomp_rule_add_exact(
2060 seccomp,
2061 SCMP_ACT_ERRNO(EPERM),
2062 SCMP_SYS(openat2),
2063 0);
2064 if (r < 0)
2065 log_debug_errno(r, "Failed to add filter for openat2: %m");
2066 else
2067 any = true;
2068#endif
2069
da4dc9a6
ZJS
2070 r = seccomp_rule_add_exact(
2071 seccomp,
2072 SCMP_ACT_ERRNO(EPERM),
2073 SCMP_SYS(creat),
2074 1,
2075 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2076 if (r < 0)
2077 log_debug_errno(r, "Failed to add filter for creat: %m");
2078 else
2079 any = true;
2080
2081 return any ? 0 : r;
2082}
2083
3c27973b
LP
2084int seccomp_restrict_suid_sgid(void) {
2085 uint32_t arch;
da4dc9a6 2086 int r, k;
3c27973b
LP
2087
2088 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2089 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2090
2091 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2092 if (r < 0)
2093 return r;
2094
da4dc9a6
ZJS
2095 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2096 if (r < 0)
2097 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2098
da4dc9a6
ZJS
2099 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2100 if (k < 0)
2101 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2102
da4dc9a6 2103 if (r < 0 && k < 0)
3c27973b 2104 continue;
3c27973b
LP
2105
2106 r = seccomp_load(seccomp);
7bc5e0b1 2107 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2108 return r;
2109 if (r < 0)
2110 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2111 }
2112
2113 return 0;
2114}
915fb324
LP
2115
2116uint32_t scmp_act_kill_process(void) {
2117
2118 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2119 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2120 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2121 * for single-threaded apps does the right thing. */
2122
2123#ifdef SCMP_ACT_KILL_PROCESS
2124 if (seccomp_api_get() >= 3)
2125 return SCMP_ACT_KILL_PROCESS;
2126#endif
2127
2128 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2129}