]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
Merge pull request #17185 from yuwata/ethtool-update
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
44aaddad 15#include "env-util.h"
d8b4d14d 16#include "errno-list.h"
a8fbdf54 17#include "macro.h"
add00535 18#include "nsflags.h"
d8b4d14d 19#include "nulstr-util.h"
78e864e5 20#include "process-util.h"
cf0fbc49 21#include "seccomp-util.h"
b16bd535 22#include "set.h"
07630cea 23#include "string-util.h"
b16bd535 24#include "strv.h"
469830d1
LP
25
26const uint32_t seccomp_local_archs[] = {
27
6b000af4 28 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
29
30#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
31 SCMP_ARCH_X86,
32 SCMP_ARCH_X86_64,
f2d9751c
LP
33 SCMP_ARCH_X32, /* native */
34#elif defined(__x86_64__) && !defined(__ILP32__)
35 SCMP_ARCH_X86,
469830d1 36 SCMP_ARCH_X32,
f2d9751c
LP
37 SCMP_ARCH_X86_64, /* native */
38#elif defined(__i386__)
39 SCMP_ARCH_X86,
40#elif defined(__aarch64__)
469830d1 41 SCMP_ARCH_ARM,
f2d9751c
LP
42 SCMP_ARCH_AARCH64, /* native */
43#elif defined(__arm__)
44 SCMP_ARCH_ARM,
45#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPSEL,
47 SCMP_ARCH_MIPS, /* native */
48#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 49 SCMP_ARCH_MIPS,
f2d9751c
LP
50 SCMP_ARCH_MIPSEL, /* native */
51#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
52 SCMP_ARCH_MIPSEL,
53 SCMP_ARCH_MIPS,
54 SCMP_ARCH_MIPSEL64N32,
469830d1 55 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
56 SCMP_ARCH_MIPSEL64,
57 SCMP_ARCH_MIPS64, /* native */
58#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPS,
469830d1 60 SCMP_ARCH_MIPSEL,
f2d9751c
LP
61 SCMP_ARCH_MIPS64N32,
62 SCMP_ARCH_MIPSEL64N32,
63 SCMP_ARCH_MIPS64,
64 SCMP_ARCH_MIPSEL64, /* native */
65#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
469830d1 68 SCMP_ARCH_MIPSEL64,
f2d9751c 69 SCMP_ARCH_MIPS64,
469830d1 70 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
71 SCMP_ARCH_MIPS64N32, /* native */
72#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64,
76 SCMP_ARCH_MIPSEL64,
77 SCMP_ARCH_MIPS64N32,
78 SCMP_ARCH_MIPSEL64N32, /* native */
79#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 80 SCMP_ARCH_PPC,
469830d1 81 SCMP_ARCH_PPC64LE,
f2d9751c
LP
82 SCMP_ARCH_PPC64, /* native */
83#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
84 SCMP_ARCH_PPC,
85 SCMP_ARCH_PPC64,
86 SCMP_ARCH_PPC64LE, /* native */
87#elif defined(__powerpc__)
88 SCMP_ARCH_PPC,
f9252236
AJ
89#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
90 SCMP_ARCH_RISCV64,
f2d9751c
LP
91#elif defined(__s390x__)
92 SCMP_ARCH_S390,
93 SCMP_ARCH_S390X, /* native */
94#elif defined(__s390__)
469830d1 95 SCMP_ARCH_S390,
469830d1
LP
96#endif
97 (uint32_t) -1
98 };
57183d11
LP
99
100const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
101 /* Maintain order used in <seccomp.h>.
102 *
103 * Names used here should be the same as those used for ConditionArchitecture=,
104 * except for "subarchitectures" like x32. */
57183d11 105
aa34055f
ZJS
106 switch(c) {
107 case SCMP_ARCH_NATIVE:
57183d11 108 return "native";
aa34055f 109 case SCMP_ARCH_X86:
57183d11 110 return "x86";
aa34055f 111 case SCMP_ARCH_X86_64:
57183d11 112 return "x86-64";
aa34055f 113 case SCMP_ARCH_X32:
57183d11 114 return "x32";
aa34055f 115 case SCMP_ARCH_ARM:
57183d11 116 return "arm";
aa34055f
ZJS
117 case SCMP_ARCH_AARCH64:
118 return "arm64";
119 case SCMP_ARCH_MIPS:
120 return "mips";
121 case SCMP_ARCH_MIPS64:
122 return "mips64";
123 case SCMP_ARCH_MIPS64N32:
124 return "mips64-n32";
125 case SCMP_ARCH_MIPSEL:
126 return "mips-le";
127 case SCMP_ARCH_MIPSEL64:
128 return "mips64-le";
129 case SCMP_ARCH_MIPSEL64N32:
130 return "mips64-le-n32";
131 case SCMP_ARCH_PPC:
132 return "ppc";
133 case SCMP_ARCH_PPC64:
134 return "ppc64";
135 case SCMP_ARCH_PPC64LE:
136 return "ppc64-le";
f9252236
AJ
137#ifdef SCMP_ARCH_RISCV64
138 case SCMP_ARCH_RISCV64:
139 return "riscv64";
140#endif
aa34055f 141 case SCMP_ARCH_S390:
6abfd303 142 return "s390";
aa34055f 143 case SCMP_ARCH_S390X:
6abfd303 144 return "s390x";
aa34055f
ZJS
145 default:
146 return NULL;
147 }
57183d11
LP
148}
149
150int seccomp_arch_from_string(const char *n, uint32_t *ret) {
151 if (!n)
152 return -EINVAL;
153
154 assert(ret);
155
156 if (streq(n, "native"))
157 *ret = SCMP_ARCH_NATIVE;
158 else if (streq(n, "x86"))
159 *ret = SCMP_ARCH_X86;
160 else if (streq(n, "x86-64"))
161 *ret = SCMP_ARCH_X86_64;
162 else if (streq(n, "x32"))
163 *ret = SCMP_ARCH_X32;
164 else if (streq(n, "arm"))
165 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
166 else if (streq(n, "arm64"))
167 *ret = SCMP_ARCH_AARCH64;
168 else if (streq(n, "mips"))
169 *ret = SCMP_ARCH_MIPS;
170 else if (streq(n, "mips64"))
171 *ret = SCMP_ARCH_MIPS64;
172 else if (streq(n, "mips64-n32"))
173 *ret = SCMP_ARCH_MIPS64N32;
174 else if (streq(n, "mips-le"))
175 *ret = SCMP_ARCH_MIPSEL;
176 else if (streq(n, "mips64-le"))
177 *ret = SCMP_ARCH_MIPSEL64;
178 else if (streq(n, "mips64-le-n32"))
179 *ret = SCMP_ARCH_MIPSEL64N32;
180 else if (streq(n, "ppc"))
181 *ret = SCMP_ARCH_PPC;
182 else if (streq(n, "ppc64"))
183 *ret = SCMP_ARCH_PPC64;
184 else if (streq(n, "ppc64-le"))
185 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
186#ifdef SCMP_ARCH_RISCV64
187 else if (streq(n, "riscv64"))
188 *ret = SCMP_ARCH_RISCV64;
189#endif
6abfd303
HB
190 else if (streq(n, "s390"))
191 *ret = SCMP_ARCH_S390;
192 else if (streq(n, "s390x"))
193 *ret = SCMP_ARCH_S390X;
57183d11
LP
194 else
195 return -EINVAL;
196
197 return 0;
198}
e9642be2 199
469830d1 200int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 201 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
202 int r;
203
469830d1
LP
204 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
205 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
206
207 seccomp = seccomp_init(default_action);
208 if (!seccomp)
209 return -ENOMEM;
210
469830d1
LP
211 if (arch != SCMP_ARCH_NATIVE &&
212 arch != seccomp_arch_native()) {
213
1b52793d 214 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 215 if (r < 0)
b4eaa6cc 216 return r;
469830d1 217
1b52793d 218 r = seccomp_arch_add(seccomp, arch);
469830d1 219 if (r < 0)
b4eaa6cc 220 return r;
469830d1
LP
221
222 assert(seccomp_arch_exist(seccomp, arch) >= 0);
223 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
224 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
225 } else {
226 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
227 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
228 }
229
230 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 231 if (r < 0)
b4eaa6cc 232 return r;
8d7b0c8f
LP
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
235 if (r < 0)
b4eaa6cc 236 return r;
8d7b0c8f 237
44aaddad
SD
238#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
239 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
240 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
241 if (r < 0)
242 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
243 }
244#endif
245
b4eaa6cc 246 *ret = TAKE_PTR(seccomp);
8d7b0c8f 247 return 0;
8d7b0c8f
LP
248}
249
static bool is_basic_seccomp_available(void) {
        /* Probe: seccomp counts as basically available if querying the current seccomp mode via
         * prctl(PR_GET_SECCOMP) does not fail. */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
253
static bool is_seccomp_filter_available(void) {
        /* Probe: try to switch to filter mode with a NULL filter program. Filter mode is considered
         * available only if that call fails and the failure is specifically EFAULT; any other outcome
         * (including success) counts as "not available". */

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
bool is_seccomp_available(void) {
        /* Tells whether both the basic seccomp probe and the filter-mode probe succeed. The probes run
         * on the first call only; the outcome is kept in a function-local static (negative meaning
         * "not determined yet") and reused afterwards. */
        static int cached = -1;

        if (cached >= 0)
                return cached;

        cached = is_basic_seccomp_available() &&
                 is_seccomp_filter_available();

        return cached;
}
269
8130926d 270const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 271 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 272 .name = "@default",
d5efc18b 273 .help = "System calls that are always permitted",
40eb6a80 274 .value =
8e24b1d2 275 "cacheflush\0"
40eb6a80 276 "clock_getres\0"
6ca67710 277 "clock_getres_time64\0"
40eb6a80 278 "clock_gettime\0"
6ca67710 279 "clock_gettime64\0"
40eb6a80 280 "clock_nanosleep\0"
6ca67710 281 "clock_nanosleep_time64\0"
40eb6a80
ZJS
282 "execve\0"
283 "exit\0"
284 "exit_group\0"
e41b0f42 285 "futex\0"
6ca67710 286 "futex_time64\0"
e41b0f42
LP
287 "get_robust_list\0"
288 "get_thread_area\0"
09d3020b
DH
289 "getegid\0"
290 "getegid32\0"
291 "geteuid\0"
292 "geteuid32\0"
293 "getgid\0"
294 "getgid32\0"
295 "getgroups\0"
296 "getgroups32\0"
297 "getpgid\0"
298 "getpgrp\0"
299 "getpid\0"
300 "getppid\0"
301 "getresgid\0"
302 "getresgid32\0"
303 "getresuid\0"
304 "getresuid32\0"
40eb6a80 305 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
306 "getsid\0"
307 "gettid\0"
40eb6a80 308 "gettimeofday\0"
09d3020b
DH
309 "getuid\0"
310 "getuid32\0"
e41b0f42 311 "membarrier\0"
40eb6a80
ZJS
312 "nanosleep\0"
313 "pause\0"
4c3a9176 314 "prlimit64\0"
e41b0f42 315 "restart_syscall\0"
6fee3be0 316 "rseq\0"
40eb6a80 317 "rt_sigreturn\0"
8f44de08 318 "sched_yield\0"
e41b0f42
LP
319 "set_robust_list\0"
320 "set_thread_area\0"
321 "set_tid_address\0"
ce5faeac 322 "set_tls\0"
40eb6a80
ZJS
323 "sigreturn\0"
324 "time\0"
4c3a9176 325 "ugetrlimit\0"
40eb6a80 326 },
44898c53
LP
327 [SYSCALL_FILTER_SET_AIO] = {
328 .name = "@aio",
329 .help = "Asynchronous IO",
330 .value =
331 "io_cancel\0"
332 "io_destroy\0"
333 "io_getevents\0"
a05cfe23 334 "io_pgetevents\0"
6ca67710 335 "io_pgetevents_time64\0"
44898c53
LP
336 "io_setup\0"
337 "io_submit\0"
9e486265
LP
338 "io_uring_enter\0"
339 "io_uring_register\0"
340 "io_uring_setup\0"
44898c53 341 },
133ddbbe 342 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 343 .name = "@basic-io",
d5efc18b 344 .help = "Basic IO",
133ddbbe 345 .value =
648a0ed0 346 "_llseek\0"
133ddbbe 347 "close\0"
6ea0d25c 348 "close_range\0"
648a0ed0 349 "dup\0"
133ddbbe
LP
350 "dup2\0"
351 "dup3\0"
133ddbbe
LP
352 "lseek\0"
353 "pread64\0"
354 "preadv\0"
44898c53 355 "preadv2\0"
133ddbbe
LP
356 "pwrite64\0"
357 "pwritev\0"
44898c53 358 "pwritev2\0"
133ddbbe
LP
359 "read\0"
360 "readv\0"
361 "write\0"
362 "writev\0"
363 },
44898c53
LP
364 [SYSCALL_FILTER_SET_CHOWN] = {
365 .name = "@chown",
366 .help = "Change ownership of files and directories",
367 .value =
368 "chown\0"
369 "chown32\0"
370 "fchown\0"
371 "fchown32\0"
372 "fchownat\0"
373 "lchown\0"
374 "lchown32\0"
375 },
8130926d 376 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 377 .name = "@clock",
d5efc18b 378 .help = "Change the system time",
201c1cc2
TM
379 .value =
380 "adjtimex\0"
1f9ac68b 381 "clock_adjtime\0"
6ca67710 382 "clock_adjtime64\0"
1f9ac68b 383 "clock_settime\0"
6ca67710 384 "clock_settime64\0"
201c1cc2 385 "settimeofday\0"
1f9ac68b 386 "stime\0"
8130926d
LP
387 },
388 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 389 .name = "@cpu-emulation",
d5efc18b 390 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
391 .value =
392 "modify_ldt\0"
393 "subpage_prot\0"
394 "switch_endian\0"
395 "vm86\0"
396 "vm86old\0"
8130926d
LP
397 },
398 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 399 .name = "@debug",
d5efc18b 400 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
401 .value =
402 "lookup_dcookie\0"
403 "perf_event_open\0"
8270e3d8 404 "pidfd_getfd\0"
1f9ac68b
LP
405 "ptrace\0"
406 "rtas\0"
6da432fd 407#if defined __s390__ || defined __s390x__
1f9ac68b 408 "s390_runtime_instr\0"
8130926d 409#endif
1f9ac68b 410 "sys_debug_setcontext\0"
8130926d 411 },
1a1b13c9
LP
412 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
413 .name = "@file-system",
414 .help = "File system operations",
415 .value =
416 "access\0"
417 "chdir\0"
418 "chmod\0"
419 "close\0"
420 "creat\0"
421 "faccessat\0"
bcf08acb 422 "faccessat2\0"
1a1b13c9
LP
423 "fallocate\0"
424 "fchdir\0"
425 "fchmod\0"
426 "fchmodat\0"
1a1b13c9 427 "fcntl\0"
ceaa6aa7 428 "fcntl64\0"
1a1b13c9
LP
429 "fgetxattr\0"
430 "flistxattr\0"
ceaa6aa7 431 "fremovexattr\0"
1a1b13c9 432 "fsetxattr\0"
1a1b13c9 433 "fstat\0"
ceaa6aa7 434 "fstat64\0"
1a1b13c9 435 "fstatat64\0"
1a1b13c9 436 "fstatfs\0"
ceaa6aa7 437 "fstatfs64\0"
1a1b13c9 438 "ftruncate\0"
ceaa6aa7 439 "ftruncate64\0"
1a1b13c9
LP
440 "futimesat\0"
441 "getcwd\0"
1a1b13c9 442 "getdents\0"
ceaa6aa7 443 "getdents64\0"
1a1b13c9
LP
444 "getxattr\0"
445 "inotify_add_watch\0"
ceaa6aa7 446 "inotify_init\0"
1a1b13c9
LP
447 "inotify_init1\0"
448 "inotify_rm_watch\0"
449 "lgetxattr\0"
450 "link\0"
451 "linkat\0"
452 "listxattr\0"
453 "llistxattr\0"
454 "lremovexattr\0"
455 "lsetxattr\0"
1a1b13c9 456 "lstat\0"
ceaa6aa7 457 "lstat64\0"
1a1b13c9
LP
458 "mkdir\0"
459 "mkdirat\0"
460 "mknod\0"
461 "mknodat\0"
1a1b13c9 462 "mmap\0"
ceaa6aa7 463 "mmap2\0"
7961116e 464 "munmap\0"
1a1b13c9 465 "newfstatat\0"
ceaa6aa7
LP
466 "oldfstat\0"
467 "oldlstat\0"
468 "oldstat\0"
1a1b13c9
LP
469 "open\0"
470 "openat\0"
8270e3d8 471 "openat2\0"
1a1b13c9
LP
472 "readlink\0"
473 "readlinkat\0"
474 "removexattr\0"
475 "rename\0"
1a1b13c9 476 "renameat\0"
ceaa6aa7 477 "renameat2\0"
1a1b13c9
LP
478 "rmdir\0"
479 "setxattr\0"
1a1b13c9 480 "stat\0"
ceaa6aa7 481 "stat64\0"
1a1b13c9 482 "statfs\0"
ceaa6aa7 483 "statfs64\0"
a4135a74 484 "statx\0"
1a1b13c9
LP
485 "symlink\0"
486 "symlinkat\0"
1a1b13c9 487 "truncate\0"
ceaa6aa7 488 "truncate64\0"
1a1b13c9
LP
489 "unlink\0"
490 "unlinkat\0"
ceaa6aa7 491 "utime\0"
1a1b13c9 492 "utimensat\0"
6ca67710 493 "utimensat_time64\0"
1a1b13c9
LP
494 "utimes\0"
495 },
8130926d 496 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 497 .name = "@io-event",
d5efc18b 498 .help = "Event loop system calls",
201c1cc2
TM
499 .value =
500 "_newselect\0"
201c1cc2 501 "epoll_create\0"
215728ff 502 "epoll_create1\0"
201c1cc2
TM
503 "epoll_ctl\0"
504 "epoll_ctl_old\0"
505 "epoll_pwait\0"
506 "epoll_wait\0"
507 "epoll_wait_old\0"
201c1cc2 508 "eventfd\0"
215728ff 509 "eventfd2\0"
201c1cc2
TM
510 "poll\0"
511 "ppoll\0"
6ca67710 512 "ppoll_time64\0"
201c1cc2 513 "pselect6\0"
6ca67710 514 "pselect6_time64\0"
201c1cc2 515 "select\0"
8130926d
LP
516 },
517 [SYSCALL_FILTER_SET_IPC] = {
8130926d 518 .name = "@ipc",
d5efc18b
ZJS
519 .help = "SysV IPC, POSIX Message Queues or other IPC",
520 .value =
521 "ipc\0"
cd5bfd7e 522 "memfd_create\0"
201c1cc2
TM
523 "mq_getsetattr\0"
524 "mq_notify\0"
525 "mq_open\0"
526 "mq_timedreceive\0"
6ca67710 527 "mq_timedreceive_time64\0"
201c1cc2 528 "mq_timedsend\0"
6ca67710 529 "mq_timedsend_time64\0"
201c1cc2
TM
530 "mq_unlink\0"
531 "msgctl\0"
532 "msgget\0"
533 "msgrcv\0"
534 "msgsnd\0"
cd5bfd7e 535 "pipe\0"
215728ff 536 "pipe2\0"
201c1cc2
TM
537 "process_vm_readv\0"
538 "process_vm_writev\0"
539 "semctl\0"
540 "semget\0"
541 "semop\0"
542 "semtimedop\0"
6ca67710 543 "semtimedop_time64\0"
201c1cc2
TM
544 "shmat\0"
545 "shmctl\0"
546 "shmdt\0"
547 "shmget\0"
8130926d
LP
548 },
549 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 550 .name = "@keyring",
d5efc18b 551 .help = "Kernel keyring access",
1f9ac68b
LP
552 .value =
553 "add_key\0"
554 "keyctl\0"
555 "request_key\0"
8130926d 556 },
cd0ddf6f
LP
557 [SYSCALL_FILTER_SET_MEMLOCK] = {
558 .name = "@memlock",
559 .help = "Memory locking control",
560 .value =
561 "mlock\0"
562 "mlock2\0"
563 "mlockall\0"
564 "munlock\0"
565 "munlockall\0"
566 },
8130926d 567 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 568 .name = "@module",
d5efc18b 569 .help = "Loading and unloading of kernel modules",
201c1cc2 570 .value =
201c1cc2
TM
571 "delete_module\0"
572 "finit_module\0"
573 "init_module\0"
8130926d
LP
574 },
575 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 576 .name = "@mount",
d5efc18b 577 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
578 .value =
579 "chroot\0"
9e486265
LP
580 "fsconfig\0"
581 "fsmount\0"
582 "fsopen\0"
583 "fspick\0"
201c1cc2 584 "mount\0"
9e486265
LP
585 "move_mount\0"
586 "open_tree\0"
201c1cc2 587 "pivot_root\0"
201c1cc2 588 "umount\0"
215728ff 589 "umount2\0"
8130926d
LP
590 },
591 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 592 .name = "@network-io",
d5efc18b 593 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 594 .value =
201c1cc2 595 "accept\0"
215728ff 596 "accept4\0"
201c1cc2
TM
597 "bind\0"
598 "connect\0"
599 "getpeername\0"
600 "getsockname\0"
601 "getsockopt\0"
602 "listen\0"
603 "recv\0"
604 "recvfrom\0"
605 "recvmmsg\0"
6ca67710 606 "recvmmsg_time64\0"
201c1cc2
TM
607 "recvmsg\0"
608 "send\0"
609 "sendmmsg\0"
610 "sendmsg\0"
611 "sendto\0"
612 "setsockopt\0"
613 "shutdown\0"
614 "socket\0"
615 "socketcall\0"
616 "socketpair\0"
8130926d
LP
617 },
618 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 619 /* some unknown even to libseccomp */
8130926d 620 .name = "@obsolete",
d5efc18b 621 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
622 .value =
623 "_sysctl\0"
624 "afs_syscall\0"
802fa07a 625 "bdflush\0"
201c1cc2 626 "break\0"
1f9ac68b 627 "create_module\0"
201c1cc2
TM
628 "ftime\0"
629 "get_kernel_syms\0"
201c1cc2
TM
630 "getpmsg\0"
631 "gtty\0"
7e0c3b8f 632 "idle\0"
201c1cc2 633 "lock\0"
201c1cc2 634 "mpx\0"
201c1cc2
TM
635 "prof\0"
636 "profil\0"
201c1cc2
TM
637 "putpmsg\0"
638 "query_module\0"
201c1cc2
TM
639 "security\0"
640 "sgetmask\0"
641 "ssetmask\0"
642 "stty\0"
1f9ac68b 643 "sysfs\0"
201c1cc2
TM
644 "tuxcall\0"
645 "ulimit\0"
646 "uselib\0"
1f9ac68b 647 "ustat\0"
201c1cc2 648 "vserver\0"
8130926d 649 },
9493b168
ZJS
650 [SYSCALL_FILTER_SET_PKEY] = {
651 .name = "@pkey",
652 .help = "System calls used for memory protection keys",
653 .value =
654 "pkey_alloc\0"
655 "pkey_free\0"
656 "pkey_mprotect\0"
657 },
8130926d 658 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 659 .name = "@privileged",
d5efc18b 660 .help = "All system calls which need super-user capabilities",
201c1cc2 661 .value =
44898c53 662 "@chown\0"
201c1cc2
TM
663 "@clock\0"
664 "@module\0"
665 "@raw-io\0"
af0f047b
LP
666 "@reboot\0"
667 "@swap\0"
215728ff 668 "_sysctl\0"
201c1cc2 669 "acct\0"
201c1cc2 670 "bpf\0"
1f9ac68b 671 "capset\0"
201c1cc2 672 "chroot\0"
a05cfe23 673 "fanotify_init\0"
9e486265 674 "fanotify_mark\0"
201c1cc2 675 "nfsservctl\0"
a05cfe23 676 "open_by_handle_at\0"
201c1cc2
TM
677 "pivot_root\0"
678 "quotactl\0"
201c1cc2 679 "setdomainname\0"
201c1cc2 680 "setfsuid\0"
215728ff 681 "setfsuid32\0"
201c1cc2 682 "setgroups\0"
215728ff 683 "setgroups32\0"
201c1cc2 684 "sethostname\0"
201c1cc2 685 "setresuid\0"
215728ff 686 "setresuid32\0"
201c1cc2 687 "setreuid\0"
215728ff 688 "setreuid32\0"
e05ee49b 689 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 690 "setuid32\0"
201c1cc2 691 "vhangup\0"
8130926d
LP
692 },
693 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 694 .name = "@process",
7b121df6 695 .help = "Process control, execution, namespacing operations",
201c1cc2
TM
696 .value =
697 "arch_prctl\0"
09d3020b 698 "capget\0" /* Able to query arbitrary processes */
201c1cc2 699 "clone\0"
9e486265 700 "clone3\0"
201c1cc2
TM
701 "execveat\0"
702 "fork\0"
b887d2eb 703 "getrusage\0"
201c1cc2 704 "kill\0"
9e486265 705 "pidfd_open\0"
46fcf95d 706 "pidfd_send_signal\0"
201c1cc2 707 "prctl\0"
b887d2eb
LP
708 "rt_sigqueueinfo\0"
709 "rt_tgsigqueueinfo\0"
201c1cc2 710 "setns\0"
a9518dc3 711 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 712 "tgkill\0"
b887d2eb 713 "times\0"
201c1cc2
TM
714 "tkill\0"
715 "unshare\0"
716 "vfork\0"
b887d2eb
LP
717 "wait4\0"
718 "waitid\0"
719 "waitpid\0"
8130926d
LP
720 },
721 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 722 .name = "@raw-io",
d5efc18b 723 .help = "Raw I/O port access",
201c1cc2
TM
724 .value =
725 "ioperm\0"
726 "iopl\0"
1f9ac68b 727 "pciconfig_iobase\0"
201c1cc2
TM
728 "pciconfig_read\0"
729 "pciconfig_write\0"
6da432fd 730#if defined __s390__ || defined __s390x__
201c1cc2
TM
731 "s390_pci_mmio_read\0"
732 "s390_pci_mmio_write\0"
8130926d
LP
733#endif
734 },
bd2ab3f4
LP
735 [SYSCALL_FILTER_SET_REBOOT] = {
736 .name = "@reboot",
737 .help = "Reboot and reboot preparation/kexec",
738 .value =
bd2ab3f4 739 "kexec_file_load\0"
e59608fa 740 "kexec_load\0"
bd2ab3f4
LP
741 "reboot\0"
742 },
133ddbbe 743 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 744 .name = "@resources",
58a8f68b 745 .help = "Alter resource settings",
133ddbbe 746 .value =
0963c053
LP
747 "ioprio_set\0"
748 "mbind\0"
749 "migrate_pages\0"
750 "move_pages\0"
751 "nice\0"
0963c053
LP
752 "sched_setaffinity\0"
753 "sched_setattr\0"
133ddbbe
LP
754 "sched_setparam\0"
755 "sched_setscheduler\0"
0963c053 756 "set_mempolicy\0"
133ddbbe
LP
757 "setpriority\0"
758 "setrlimit\0"
133ddbbe 759 },
6eaaeee9
LP
760 [SYSCALL_FILTER_SET_SETUID] = {
761 .name = "@setuid",
762 .help = "Operations for changing user/group credentials",
763 .value =
6eaaeee9 764 "setgid\0"
215728ff 765 "setgid32\0"
6eaaeee9 766 "setgroups\0"
215728ff 767 "setgroups32\0"
6eaaeee9 768 "setregid\0"
215728ff 769 "setregid32\0"
6eaaeee9 770 "setresgid\0"
215728ff 771 "setresgid32\0"
6eaaeee9 772 "setresuid\0"
215728ff 773 "setresuid32\0"
6eaaeee9 774 "setreuid\0"
215728ff 775 "setreuid32\0"
6eaaeee9 776 "setuid\0"
215728ff 777 "setuid32\0"
6eaaeee9 778 },
cd0ddf6f
LP
779 [SYSCALL_FILTER_SET_SIGNAL] = {
780 .name = "@signal",
781 .help = "Process signal handling",
782 .value =
783 "rt_sigaction\0"
784 "rt_sigpending\0"
785 "rt_sigprocmask\0"
786 "rt_sigsuspend\0"
787 "rt_sigtimedwait\0"
6ca67710 788 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
789 "sigaction\0"
790 "sigaltstack\0"
791 "signal\0"
792 "signalfd\0"
793 "signalfd4\0"
794 "sigpending\0"
795 "sigprocmask\0"
796 "sigsuspend\0"
797 },
bd2ab3f4
LP
798 [SYSCALL_FILTER_SET_SWAP] = {
799 .name = "@swap",
800 .help = "Enable/disable swap devices",
801 .value =
802 "swapoff\0"
803 "swapon\0"
804 },
44898c53
LP
805 [SYSCALL_FILTER_SET_SYNC] = {
806 .name = "@sync",
807 .help = "Synchronize files and memory to storage",
808 .value =
809 "fdatasync\0"
810 "fsync\0"
811 "msync\0"
812 "sync\0"
813 "sync_file_range\0"
a8fb09f5 814 "sync_file_range2\0"
44898c53
LP
815 "syncfs\0"
816 },
70526841
LP
817 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
818 .name = "@system-service",
819 .help = "General system service operations",
820 .value =
821 "@aio\0"
822 "@basic-io\0"
823 "@chown\0"
824 "@default\0"
825 "@file-system\0"
826 "@io-event\0"
827 "@ipc\0"
828 "@keyring\0"
829 "@memlock\0"
830 "@network-io\0"
831 "@process\0"
832 "@resources\0"
833 "@setuid\0"
834 "@signal\0"
835 "@sync\0"
836 "@timer\0"
837 "brk\0"
838 "capget\0"
839 "capset\0"
840 "copy_file_range\0"
841 "fadvise64\0"
842 "fadvise64_64\0"
843 "flock\0"
844 "get_mempolicy\0"
845 "getcpu\0"
846 "getpriority\0"
847 "getrandom\0"
848 "ioctl\0"
849 "ioprio_get\0"
850 "kcmp\0"
851 "madvise\0"
70526841
LP
852 "mprotect\0"
853 "mremap\0"
854 "name_to_handle_at\0"
855 "oldolduname\0"
856 "olduname\0"
857 "personality\0"
858 "readahead\0"
859 "readdir\0"
860 "remap_file_pages\0"
861 "sched_get_priority_max\0"
862 "sched_get_priority_min\0"
863 "sched_getaffinity\0"
864 "sched_getattr\0"
865 "sched_getparam\0"
866 "sched_getscheduler\0"
867 "sched_rr_get_interval\0"
6ca67710 868 "sched_rr_get_interval_time64\0"
70526841
LP
869 "sched_yield\0"
870 "sendfile\0"
871 "sendfile64\0"
872 "setfsgid\0"
873 "setfsgid32\0"
874 "setfsuid\0"
875 "setfsuid32\0"
876 "setpgid\0"
877 "setsid\0"
878 "splice\0"
879 "sysinfo\0"
880 "tee\0"
881 "umask\0"
882 "uname\0"
883 "userfaultfd\0"
884 "vmsplice\0"
885 },
cd0ddf6f
LP
886 [SYSCALL_FILTER_SET_TIMER] = {
887 .name = "@timer",
888 .help = "Schedule operations by time",
889 .value =
890 "alarm\0"
891 "getitimer\0"
892 "setitimer\0"
893 "timer_create\0"
894 "timer_delete\0"
895 "timer_getoverrun\0"
896 "timer_gettime\0"
6ca67710 897 "timer_gettime64\0"
cd0ddf6f 898 "timer_settime\0"
6ca67710 899 "timer_settime64\0"
cd0ddf6f
LP
900 "timerfd_create\0"
901 "timerfd_gettime\0"
6ca67710 902 "timerfd_gettime64\0"
cd0ddf6f 903 "timerfd_settime\0"
6ca67710 904 "timerfd_settime64\0"
cd0ddf6f
LP
905 "times\0"
906 },
95aac012
ZJS
907 [SYSCALL_FILTER_SET_KNOWN] = {
908 .name = "@known",
909 .help = "All known syscalls declared in the kernel",
910 .value =
911#include "syscall-list.h"
912 },
201c1cc2 913};
8130926d
LP
914
915const SyscallFilterSet *syscall_filter_set_find(const char *name) {
8130926d
LP
916 if (isempty(name) || name[0] != '@')
917 return NULL;
918
077e8fc0 919 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
8130926d
LP
920 if (streq(syscall_filter_sets[i].name, name))
921 return syscall_filter_sets + i;
922
923 return NULL;
924}
925
000c0520
ZJS
926static int add_syscall_filter_set(
927 scmp_filter_ctx seccomp,
928 const SyscallFilterSet *set,
929 uint32_t action,
930 char **exclude,
931 bool log_missing,
932 char ***added);
933
934int seccomp_add_syscall_filter_item(
935 scmp_filter_ctx *seccomp,
936 const char *name,
937 uint32_t action,
938 char **exclude,
939 bool log_missing,
940 char ***added) {
69b1b241
LP
941
942 assert(seccomp);
943 assert(name);
944
960e4569
LP
945 if (strv_contains(exclude, name))
946 return 0;
947
000c0520
ZJS
948 /* Any syscalls that are handled are added to the *added strv. The pointer
949 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
950
69b1b241
LP
951 if (name[0] == '@') {
952 const SyscallFilterSet *other;
953
954 other = syscall_filter_set_find(name);
baaa35ad
ZJS
955 if (!other)
956 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
957 "Filter set %s is not known!",
958 name);
69b1b241 959
000c0520 960 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
b54f36c6 961
69b1b241 962 } else {
b54f36c6 963 int id, r;
69b1b241
LP
964
965 id = seccomp_syscall_resolve_name(name);
cff7bff8 966 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
967 if (log_missing)
968 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 969 return 0;
cff7bff8 970 }
69b1b241
LP
971
972 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 973 if (r < 0) {
69b1b241 974 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
975 bool ignore = r == -EDOM;
976
977 if (!ignore || log_missing)
978 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
979 name, id, ignore ? ", ignoring" : "");
980 if (!ignore)
981 return r;
b54f36c6 982 }
69b1b241 983
000c0520
ZJS
984 if (added) {
985 r = strv_extend(added, name);
986 if (r < 0)
987 return r;
988 }
989
b54f36c6
ZJS
990 return 0;
991 }
69b1b241
LP
992}
993
000c0520 994static int add_syscall_filter_set(
469830d1 995 scmp_filter_ctx seccomp,
469830d1 996 const SyscallFilterSet *set,
960e4569 997 uint32_t action,
b54f36c6 998 char **exclude,
000c0520
ZJS
999 bool log_missing,
1000 char ***added) {
469830d1 1001
8130926d
LP
1002 const char *sys;
1003 int r;
1004
000c0520
ZJS
1005 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1006
8130926d
LP
1007 assert(seccomp);
1008 assert(set);
1009
1010 NULSTR_FOREACH(sys, set->value) {
000c0520 1011 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
69b1b241
LP
1012 if (r < 0)
1013 return r;
469830d1
LP
1014 }
1015
1016 return 0;
1017}
1018
b54f36c6 1019int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
1020 uint32_t arch;
1021 int r;
1022
1023 assert(set);
1024
1025 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 1026 * each local arch. */
469830d1
LP
1027
1028 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1029 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1030
1031 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1032
1033 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
1034 if (r < 0)
1035 return r;
469830d1 1036
000c0520 1037 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
7e86bd73
ZJS
1038 if (r < 0)
1039 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
1040
1041 r = seccomp_load(seccomp);
7bc5e0b1 1042 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1043 return r;
1044 if (r < 0)
1045 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
1046 }
1047
1048 return 0;
1049}
a3be2849 1050
b54f36c6 1051int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 1052 uint32_t arch;
a3be2849
LP
1053 int r;
1054
469830d1
LP
1055 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1056 * SyscallFilterSet* table. */
a3be2849 1057
8cfa775f 1058 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 1059 return 0;
a3be2849 1060
469830d1
LP
1061 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1062 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b54f36c6 1063 void *syscall_id, *val;
a3be2849 1064
469830d1 1065 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1066
469830d1
LP
1067 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1068 if (r < 0)
1069 return r;
a3be2849 1070
90e74a66 1071 HASHMAP_FOREACH_KEY(val, syscall_id, set) {
8cfa775f 1072 uint32_t a = action;
b54f36c6
ZJS
1073 int id = PTR_TO_INT(syscall_id) - 1;
1074 int error = PTR_TO_INT(val);
8cfa775f 1075
005bfaf1
TM
1076 if (error == SECCOMP_ERROR_NUMBER_KILL)
1077 a = scmp_act_kill_process();
9df2cdd8
TM
1078#ifdef SCMP_ACT_LOG
1079 else if (action == SCMP_ACT_LOG)
1080 a = SCMP_ACT_LOG;
1081#endif
005bfaf1 1082 else if (action != SCMP_ACT_ALLOW && error >= 0)
b54f36c6 1083 a = SCMP_ACT_ERRNO(error);
8cfa775f 1084
b54f36c6 1085 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
1086 if (r < 0) {
1087 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1088 _cleanup_free_ char *n = NULL;
7e86bd73 1089 bool ignore;
469830d1 1090
b54f36c6 1091 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1092 ignore = r == -EDOM;
1093 if (!ignore || log_missing)
1094 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1095 strna(n), id, ignore ? ", ignoring" : "");
1096 if (!ignore)
1097 return r;
469830d1
LP
1098 }
1099 }
1100
1101 r = seccomp_load(seccomp);
7bc5e0b1 1102 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1103 return r;
1104 if (r < 0)
1105 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1106 }
1107
1108 return 0;
add00535
LP
1109}
1110
58f6ab44 1111int seccomp_parse_syscall_filter(
898748d8
YW
1112 const char *name,
1113 int errno_num,
1114 Hashmap *filter,
13d92c63 1115 SeccompParseFlags flags,
898748d8
YW
1116 const char *unit,
1117 const char *filename,
1118 unsigned line) {
1119
1120 int r;
1121
1122 assert(name);
1123 assert(filter);
1124
1125 if (name[0] == '@') {
1126 const SyscallFilterSet *set;
1127 const char *i;
1128
1129 set = syscall_filter_set_find(name);
1130 if (!set) {
13d92c63 1131 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1132 return -EINVAL;
13d92c63
LP
1133
1134 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1135 "Unknown system call group, ignoring: %s", name);
1136 return 0;
898748d8
YW
1137 }
1138
1139 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1140 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1141 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1142 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1143 * about them. */
58f6ab44 1144 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1145 if (r < 0)
1146 return r;
1147 }
1148 } else {
1149 int id;
1150
1151 id = seccomp_syscall_resolve_name(name);
1152 if (id == __NR_SCMP_ERROR) {
13d92c63 1153 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1154 return -EINVAL;
13d92c63
LP
1155
1156 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1157 "Failed to parse system call, ignoring: %s", name);
1158 return 0;
898748d8
YW
1159 }
1160
1161 /* If we previously wanted to forbid a syscall and now
1162 * we want to allow it, then remove it from the list. */
6b000af4 1163 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
898748d8
YW
1164 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1165 if (r < 0)
851ee70a
LW
1166 switch (r) {
1167 case -ENOMEM:
1168 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1169 case -EEXIST:
9d7fe7c6
LW
1170 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1171 break;
851ee70a
LW
1172 default:
1173 return r;
1174 }
898748d8
YW
1175 } else
1176 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1177 }
1178
1179 return 0;
1180}
1181
add00535 1182int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1183 uint32_t arch;
add00535
LP
1184 int r;
1185
f1d34068 1186 if (DEBUG_LOGGING) {
add00535
LP
1187 _cleanup_free_ char *s = NULL;
1188
86c2a9f1 1189 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1190 log_debug("Restricting namespace to: %s.", strna(s));
1191 }
1192
1193 /* NOOP? */
d7a0f1f4 1194 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
add00535
LP
1195 return 0;
1196
469830d1
LP
1197 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1198 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
add00535 1199
469830d1
LP
1200 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1201
1202 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1203 if (r < 0)
1204 return r;
1205
1206 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1207 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1208 * altogether. */
1209 r = seccomp_rule_add_exact(
1210 seccomp,
1211 SCMP_ACT_ERRNO(EPERM),
1212 SCMP_SYS(setns),
1213 0);
1214 else
1215 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1216 * special invocation with a zero flags argument, right here. */
1217 r = seccomp_rule_add_exact(
1218 seccomp,
1219 SCMP_ACT_ERRNO(EPERM),
1220 SCMP_SYS(setns),
1221 1,
1222 SCMP_A1(SCMP_CMP_EQ, 0));
1223 if (r < 0) {
1224 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1225 continue;
1226 }
1227
077e8fc0 1228 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
469830d1
LP
1229 unsigned long f;
1230
1231 f = namespace_flag_map[i].flag;
d7a0f1f4 1232 if (FLAGS_SET(retain, f)) {
469830d1
LP
1233 log_debug("Permitting %s.", namespace_flag_map[i].name);
1234 continue;
1235 }
1236
1237 log_debug("Blocking %s.", namespace_flag_map[i].name);
1238
1239 r = seccomp_rule_add_exact(
1240 seccomp,
1241 SCMP_ACT_ERRNO(EPERM),
1242 SCMP_SYS(unshare),
1243 1,
1244 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1245 if (r < 0) {
1246 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1247 break;
1248 }
1249
511ceb1f
ZJS
1250 /* On s390/s390x the first two parameters to clone are switched */
1251 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1252 r = seccomp_rule_add_exact(
1253 seccomp,
1254 SCMP_ACT_ERRNO(EPERM),
1255 SCMP_SYS(clone),
1256 1,
1257 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1258 else
1259 r = seccomp_rule_add_exact(
1260 seccomp,
1261 SCMP_ACT_ERRNO(EPERM),
1262 SCMP_SYS(clone),
1263 1,
1264 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1265 if (r < 0) {
1266 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1267 break;
1268 }
1269
1270 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1271 r = seccomp_rule_add_exact(
1272 seccomp,
1273 SCMP_ACT_ERRNO(EPERM),
1274 SCMP_SYS(setns),
1275 1,
1276 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1277 if (r < 0) {
1278 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1279 break;
1280 }
1281 }
1282 }
1283 if (r < 0)
1284 continue;
1285
1286 r = seccomp_load(seccomp);
7bc5e0b1 1287 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1288 return r;
1289 if (r < 0)
1290 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1291 }
1292
1293 return 0;
1294}
1295
1296int seccomp_protect_sysctl(void) {
1297 uint32_t arch;
1298 int r;
1299
1300 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1301 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1302
1303 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1304
f9252236
AJ
1305 if (IN_SET(arch,
1306 SCMP_ARCH_AARCH64,
1307#ifdef SCMP_ARCH_RISCV64
1308 SCMP_ARCH_RISCV64,
1309#endif
1310 SCMP_ARCH_X32
1311 ))
2e64e8f4
ZJS
1312 /* No _sysctl syscall */
1313 continue;
1314
469830d1
LP
1315 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1316 if (r < 0)
1317 return r;
1318
1319 r = seccomp_rule_add_exact(
add00535
LP
1320 seccomp,
1321 SCMP_ACT_ERRNO(EPERM),
469830d1 1322 SCMP_SYS(_sysctl),
add00535 1323 0);
469830d1
LP
1324 if (r < 0) {
1325 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1326 continue;
1327 }
1328
1329 r = seccomp_load(seccomp);
7bc5e0b1 1330 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1331 return r;
1332 if (r < 0)
1333 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1334 }
1335
1336 return 0;
1337}
1338
620dbdd2
KK
1339int seccomp_protect_syslog(void) {
1340 uint32_t arch;
1341 int r;
1342
1343 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1344 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1345
1346 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1347 if (r < 0)
1348 return r;
1349
1350 r = seccomp_rule_add_exact(
1351 seccomp,
1352 SCMP_ACT_ERRNO(EPERM),
1353 SCMP_SYS(syslog),
1354 0);
1355
1356 if (r < 0) {
1357 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1358 continue;
1359 }
1360
1361 r = seccomp_load(seccomp);
1362 if (ERRNO_IS_SECCOMP_FATAL(r))
1363 return r;
1364 if (r < 0)
1365 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1366 }
1367
1368 return 0;
1369}
1370
6b000af4 1371int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1372 uint32_t arch;
1373 int r;
1374
1375 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1376 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1377 bool supported;
469830d1
LP
1378
1379 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1380
9606bc4b
LP
1381 switch (arch) {
1382
1383 case SCMP_ARCH_X86_64:
1384 case SCMP_ARCH_X32:
1385 case SCMP_ARCH_ARM:
1386 case SCMP_ARCH_AARCH64:
0d9fca76 1387 case SCMP_ARCH_PPC:
da1921a5
ZJS
1388 case SCMP_ARCH_PPC64:
1389 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1390 case SCMP_ARCH_MIPSEL64N32:
1391 case SCMP_ARCH_MIPS64N32:
1392 case SCMP_ARCH_MIPSEL64:
1393 case SCMP_ARCH_MIPS64:
f9252236
AJ
1394#ifdef SCMP_ARCH_RISCV64
1395 case SCMP_ARCH_RISCV64:
1396#endif
9606bc4b
LP
1397 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1398 supported = true;
1399 break;
1400
9606bc4b
LP
1401 case SCMP_ARCH_S390:
1402 case SCMP_ARCH_S390X:
da1921a5 1403 case SCMP_ARCH_X86:
f5aeac14
JC
1404 case SCMP_ARCH_MIPSEL:
1405 case SCMP_ARCH_MIPS:
9606bc4b
LP
1406 default:
1407 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1408 * don't know */
1409 supported = false;
1410 break;
1411 }
1412
1413 if (!supported)
1414 continue;
1415
469830d1
LP
1416 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1417 if (r < 0)
1418 return r;
1419
6b000af4 1420 if (allow_list) {
077e8fc0 1421 int first = 0, last = 0;
469830d1
LP
1422 void *afp;
1423
6b000af4
LP
1424 /* If this is an allow list, we first block the address families that are out of
1425 * range and then everything that is not in the set. First, we find the lowest and
1426 * highest address family in the set. */
469830d1 1427
90e74a66 1428 SET_FOREACH(afp, address_families) {
077e8fc0 1429 int af = PTR_TO_INT(afp);
469830d1
LP
1430
1431 if (af <= 0 || af >= af_max())
1432 continue;
1433
1434 if (first == 0 || af < first)
1435 first = af;
1436
1437 if (last == 0 || af > last)
1438 last = af;
1439 }
1440
1441 assert((first == 0) == (last == 0));
1442
1443 if (first == 0) {
1444
1445 /* No entries in the valid range, block everything */
1446 r = seccomp_rule_add_exact(
1447 seccomp,
1448 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1449 SCMP_SYS(socket),
1450 0);
1451 if (r < 0) {
1452 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1453 continue;
1454 }
1455
1456 } else {
1457
1458 /* Block everything below the first entry */
1459 r = seccomp_rule_add_exact(
1460 seccomp,
1461 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1462 SCMP_SYS(socket),
1463 1,
1464 SCMP_A0(SCMP_CMP_LT, first));
1465 if (r < 0) {
1466 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1467 continue;
1468 }
1469
1470 /* Block everything above the last entry */
1471 r = seccomp_rule_add_exact(
1472 seccomp,
1473 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1474 SCMP_SYS(socket),
1475 1,
1476 SCMP_A0(SCMP_CMP_GT, last));
1477 if (r < 0) {
1478 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1479 continue;
1480 }
1481
1482 /* Block everything between the first and last entry */
077e8fc0 1483 for (int af = 1; af < af_max(); af++) {
469830d1
LP
1484
1485 if (set_contains(address_families, INT_TO_PTR(af)))
1486 continue;
1487
1488 r = seccomp_rule_add_exact(
1489 seccomp,
1490 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1491 SCMP_SYS(socket),
1492 1,
1493 SCMP_A0(SCMP_CMP_EQ, af));
1494 if (r < 0)
1495 break;
1496 }
469830d1
LP
1497 if (r < 0) {
1498 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1499 continue;
1500 }
1501 }
1502
1503 } else {
1504 void *af;
1505
6b000af4
LP
1506 /* If this is a deny list, then generate one rule for each address family that are
1507 * then combined in OR checks. */
469830d1 1508
90e74a66 1509 SET_FOREACH(af, address_families) {
469830d1
LP
1510 r = seccomp_rule_add_exact(
1511 seccomp,
1512 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1513 SCMP_SYS(socket),
1514 1,
1515 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1516 if (r < 0)
1517 break;
1518 }
469830d1
LP
1519 if (r < 0) {
1520 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1521 continue;
1522 }
1523 }
1524
1525 r = seccomp_load(seccomp);
7bc5e0b1 1526 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1527 return r;
1528 if (r < 0)
1529 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1530 }
1531
1532 return 0;
1533}
1534
1535int seccomp_restrict_realtime(void) {
1536 static const int permitted_policies[] = {
1537 SCHED_OTHER,
1538 SCHED_BATCH,
1539 SCHED_IDLE,
1540 };
1541
1542 int r, max_policy = 0;
1543 uint32_t arch;
1544 unsigned i;
1545
1546 /* Determine the highest policy constant we want to allow */
1547 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1548 if (permitted_policies[i] > max_policy)
1549 max_policy = permitted_policies[i];
1550
1551 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1552 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1553 int p;
1554
1555 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1556
1557 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1558 if (r < 0)
1559 return r;
1560
1561 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1562 * allow list. */
469830d1
LP
1563 for (p = 0; p < max_policy; p++) {
1564 bool good = false;
1565
6b000af4 1566 /* Check if this is in the allow list. */
469830d1
LP
1567 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1568 if (permitted_policies[i] == p) {
1569 good = true;
1570 break;
1571 }
1572
1573 if (good)
1574 continue;
1575
1576 /* Deny this policy */
1577 r = seccomp_rule_add_exact(
1578 seccomp,
1579 SCMP_ACT_ERRNO(EPERM),
1580 SCMP_SYS(sched_setscheduler),
1581 1,
1582 SCMP_A1(SCMP_CMP_EQ, p));
1583 if (r < 0) {
1584 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1585 continue;
1586 }
1587 }
1588
6b000af4
LP
1589 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1590 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1591 r = seccomp_rule_add_exact(
add00535
LP
1592 seccomp,
1593 SCMP_ACT_ERRNO(EPERM),
469830d1 1594 SCMP_SYS(sched_setscheduler),
add00535 1595 1,
469830d1
LP
1596 SCMP_A1(SCMP_CMP_GT, max_policy));
1597 if (r < 0) {
1598 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1599 continue;
1600 }
add00535 1601
469830d1 1602 r = seccomp_load(seccomp);
7bc5e0b1 1603 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1604 return r;
1605 if (r < 0)
1606 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1607 }
1608
1609 return 0;
1610}
1611
6dc66688
ZJS
1612static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1613 uint32_t arch,
1614 int nr,
14cb109d 1615 unsigned arg_cnt,
6dc66688
ZJS
1616 const struct scmp_arg_cmp arg) {
1617 int r;
1618
1619 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1620 if (r < 0) {
1621 _cleanup_free_ char *n = NULL;
1622
1623 n = seccomp_syscall_resolve_num_arch(arch, nr);
1624 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1625 strna(n),
1626 seccomp_arch_to_string(arch));
1627 }
1628
1629 return r;
1630}
1631
2a8d6e63 1632/* For known architectures, check that syscalls are indeed defined or not. */
f9252236 1633#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
2a8d6e63
ZJS
1634assert_cc(SCMP_SYS(shmget) > 0);
1635assert_cc(SCMP_SYS(shmat) > 0);
1636assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1637#endif
6dc66688 1638
469830d1
LP
1639int seccomp_memory_deny_write_execute(void) {
1640 uint32_t arch;
b069c2a3 1641 unsigned loaded = 0;
469830d1
LP
1642
1643 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1644 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1645 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1646
469830d1
LP
1647 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1648
8a50cf69
LP
1649 switch (arch) {
1650
bed4668d
CE
1651 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1652 * We ignore that here, which means there's still a way to get writable/executable
1653 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1654
8a50cf69 1655 case SCMP_ARCH_X86:
57311925 1656 case SCMP_ARCH_S390:
8a50cf69
LP
1657 filter_syscall = SCMP_SYS(mmap2);
1658 block_syscall = SCMP_SYS(mmap);
bed4668d 1659 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1660 break;
1661
63d00dfb 1662 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1663 case SCMP_ARCH_PPC64:
1664 case SCMP_ARCH_PPC64LE:
bed4668d 1665 case SCMP_ARCH_S390X:
2a8d6e63 1666 filter_syscall = SCMP_SYS(mmap);
bed4668d 1667 /* shmat multiplexed, see above */
8a50cf69
LP
1668 break;
1669
4278d1f5
ZJS
1670 case SCMP_ARCH_ARM:
1671 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1672 shmat_syscall = SCMP_SYS(shmat);
1673 break;
1674
8a50cf69
LP
1675 case SCMP_ARCH_X86_64:
1676 case SCMP_ARCH_X32:
79873bc8 1677 case SCMP_ARCH_AARCH64:
f9252236
AJ
1678#ifdef SCMP_ARCH_RISCV64
1679 case SCMP_ARCH_RISCV64:
1680#endif
1681 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
8a50cf69
LP
1682 shmat_syscall = SCMP_SYS(shmat);
1683 break;
1684
1685 /* Please add more definitions here, if you port systemd to other architectures! */
1686
f9252236 1687#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
8a50cf69
LP
1688#warning "Consider adding the right mmap() syscall definitions here!"
1689#endif
1690 }
1691
1692 /* Can't filter mmap() on this arch, then skip it */
1693 if (filter_syscall == 0)
1694 continue;
1695
469830d1
LP
1696 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1697 if (r < 0)
1698 return r;
1699
6dc66688
ZJS
1700 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1701 1,
1702 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1703 if (r < 0)
1704 continue;
8a50cf69
LP
1705
1706 if (block_syscall != 0) {
6dc66688
ZJS
1707 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1708 if (r < 0)
8a50cf69 1709 continue;
add00535 1710 }
a3be2849 1711
6dc66688
ZJS
1712 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1713 1,
b835eeb4
ZJS
1714 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1715 if (r < 0)
1716 continue;
1717
91691f1d 1718#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1719 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1720 1,
6dc66688
ZJS
1721 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1722 if (r < 0)
469830d1 1723 continue;
91691f1d 1724#endif
add00535 1725
67fb5f33 1726 if (shmat_syscall > 0) {
5ef3ed97 1727 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1728 1,
1729 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1730 if (r < 0)
8a50cf69 1731 continue;
469830d1
LP
1732 }
1733
1734 r = seccomp_load(seccomp);
7bc5e0b1 1735 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1736 return r;
add00535 1737 if (r < 0)
b069c2a3
ZJS
1738 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1739 seccomp_arch_to_string(arch));
903659e7 1740 loaded++;
469830d1 1741 }
add00535 1742
903659e7 1743 if (loaded == 0)
b069c2a3 1744 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1745
1746 return loaded;
469830d1
LP
1747}
1748
1749int seccomp_restrict_archs(Set *archs) {
1750 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
469830d1
LP
1751 void *id;
1752 int r;
1753
1754 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1755 * list.
1756 *
1757 * There are some qualifications. However the most important use is to stop processes from bypassing
1758 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1759 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1760
2428aaf8
AJ
1761 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1762 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1763 * to run a program with the restrictions applied. */
469830d1
LP
1764 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1765 if (!seccomp)
1766 return -ENOMEM;
1767
90e74a66 1768 SET_FOREACH(id, archs) {
469830d1 1769 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1770 if (r < 0 && r != -EEXIST)
1771 return r;
1772 }
1773
1774 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1775 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1776 * The important thing is that you can block the old 32-bit x86 syscalls.
1777 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1778
1779 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1780 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1781
1782 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1783 if (r < 0 && r != -EEXIST)
469830d1 1784 return r;
add00535
LP
1785 }
1786
469830d1
LP
1787 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1788 if (r < 0)
1789 return r;
add00535 1790
1c6af69b 1791 r = seccomp_load(seccomp);
7bc5e0b1 1792 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1793 return r;
1794 if (r < 0)
1795 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1796
1797 return 0;
a3be2849 1798}
b16bd535 1799
de7fef4b
ZJS
1800int parse_syscall_archs(char **l, Set **ret_archs) {
1801 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1802 char **s;
1803 int r;
1804
1805 assert(l);
de7fef4b 1806 assert(ret_archs);
b16bd535
YW
1807
1808 STRV_FOREACH(s, l) {
1809 uint32_t a;
1810
1811 r = seccomp_arch_from_string(*s, &a);
1812 if (r < 0)
1813 return -EINVAL;
1814
de7fef4b 1815 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1816 if (r < 0)
1817 return -ENOMEM;
1818 }
1819
de7fef4b 1820 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1821 return 0;
1822}
165a31c0 1823
8cfa775f 1824int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1825 const char *i;
1826 int r;
1827
1828 assert(set);
1829
1830 NULSTR_FOREACH(i, set->value) {
1831
1832 if (i[0] == '@') {
1833 const SyscallFilterSet *more;
1834
1835 more = syscall_filter_set_find(i);
1836 if (!more)
1837 return -ENXIO;
1838
165a31c0
LP
1839 r = seccomp_filter_set_add(filter, add, more);
1840 if (r < 0)
1841 return r;
1842 } else {
1843 int id;
1844
1845 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1846 if (id == __NR_SCMP_ERROR) {
1847 log_debug("Couldn't resolve system call, ignoring: %s", i);
1848 continue;
1849 }
165a31c0
LP
1850
1851 if (add) {
8cfa775f 1852 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1853 if (r < 0)
1854 return r;
1855 } else
8cfa775f 1856 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1857 }
1858 }
1859
1860 return 0;
1861}
78e864e5
TM
1862
1863int seccomp_lock_personality(unsigned long personality) {
72eafe71 1864 uint32_t arch;
78e864e5
TM
1865 int r;
1866
72eafe71
LP
1867 if (personality >= PERSONALITY_INVALID)
1868 return -EINVAL;
78e864e5 1869
72eafe71
LP
1870 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1871 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1872
72eafe71
LP
1873 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1874 if (r < 0)
1875 return r;
1876
1877 r = seccomp_rule_add_exact(
1878 seccomp,
1879 SCMP_ACT_ERRNO(EPERM),
1880 SCMP_SYS(personality),
1881 1,
1882 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1883 if (r < 0) {
1884 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1885 continue;
1886 }
72eafe71
LP
1887
1888 r = seccomp_load(seccomp);
7bc5e0b1 1889 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1890 return r;
1891 if (r < 0)
1892 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1893 }
1894
1895 return 0;
78e864e5 1896}
aecd5ac6
TM
1897
1898int seccomp_protect_hostname(void) {
1899 uint32_t arch;
1900 int r;
1901
1902 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1903 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1904
1905 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1906 if (r < 0)
1907 return r;
1908
1909 r = seccomp_rule_add_exact(
1910 seccomp,
1911 SCMP_ACT_ERRNO(EPERM),
1912 SCMP_SYS(sethostname),
1913 0);
9e6e543c
LP
1914 if (r < 0) {
1915 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1916 continue;
9e6e543c 1917 }
aecd5ac6
TM
1918
1919 r = seccomp_rule_add_exact(
1920 seccomp,
1921 SCMP_ACT_ERRNO(EPERM),
1922 SCMP_SYS(setdomainname),
1923 0);
9e6e543c
LP
1924 if (r < 0) {
1925 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1926 continue;
9e6e543c 1927 }
aecd5ac6
TM
1928
1929 r = seccomp_load(seccomp);
7bc5e0b1 1930 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1931 return r;
1932 if (r < 0)
1933 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1934 }
1935
1936 return 0;
1937}
3c27973b 1938
da4dc9a6
ZJS
1939static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1940 /* Checks the mode_t parameter of the following system calls:
1941 *
1942 * → chmod() + fchmod() + fchmodat()
1943 * → open() + creat() + openat()
1944 * → mkdir() + mkdirat()
1945 * → mknod() + mknodat()
1946 *
1947 * Returns error if *everything* failed, and 0 otherwise.
1948 */
6d95e7d9 1949 int r;
da4dc9a6
ZJS
1950 bool any = false;
1951
1952 r = seccomp_rule_add_exact(
1953 seccomp,
1954 SCMP_ACT_ERRNO(EPERM),
1955 SCMP_SYS(chmod),
1956 1,
1957 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1958 if (r < 0)
1959 log_debug_errno(r, "Failed to add filter for chmod: %m");
1960 else
1961 any = true;
1962
1963 r = seccomp_rule_add_exact(
1964 seccomp,
1965 SCMP_ACT_ERRNO(EPERM),
1966 SCMP_SYS(fchmod),
1967 1,
1968 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1969 if (r < 0)
1970 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1971 else
1972 any = true;
1973
1974 r = seccomp_rule_add_exact(
1975 seccomp,
1976 SCMP_ACT_ERRNO(EPERM),
1977 SCMP_SYS(fchmodat),
1978 1,
1979 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1980 if (r < 0)
1981 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1982 else
1983 any = true;
1984
1985 r = seccomp_rule_add_exact(
1986 seccomp,
1987 SCMP_ACT_ERRNO(EPERM),
1988 SCMP_SYS(mkdir),
1989 1,
1990 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1991 if (r < 0)
1992 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1993 else
1994 any = true;
1995
1996 r = seccomp_rule_add_exact(
1997 seccomp,
1998 SCMP_ACT_ERRNO(EPERM),
1999 SCMP_SYS(mkdirat),
2000 1,
2001 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2002 if (r < 0)
2003 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2004 else
2005 any = true;
2006
2007 r = seccomp_rule_add_exact(
2008 seccomp,
2009 SCMP_ACT_ERRNO(EPERM),
2010 SCMP_SYS(mknod),
2011 1,
2012 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2013 if (r < 0)
2014 log_debug_errno(r, "Failed to add filter for mknod: %m");
2015 else
2016 any = true;
2017
2018 r = seccomp_rule_add_exact(
2019 seccomp,
2020 SCMP_ACT_ERRNO(EPERM),
2021 SCMP_SYS(mknodat),
2022 1,
2023 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2024 if (r < 0)
2025 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2026 else
2027 any = true;
2028
2029#if SCMP_SYS(open) > 0
2030 r = seccomp_rule_add_exact(
2031 seccomp,
2032 SCMP_ACT_ERRNO(EPERM),
2033 SCMP_SYS(open),
2034 2,
2035 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2036 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2037 if (r < 0)
2038 log_debug_errno(r, "Failed to add filter for open: %m");
2039 else
2040 any = true;
2041#endif
2042
2043 r = seccomp_rule_add_exact(
2044 seccomp,
2045 SCMP_ACT_ERRNO(EPERM),
2046 SCMP_SYS(openat),
2047 2,
2048 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2049 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2050 if (r < 0)
2051 log_debug_errno(r, "Failed to add filter for openat: %m");
2052 else
2053 any = true;
2054
ecc04067
LP
2055#if defined(__SNR_openat2)
2056 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2057 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2058 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2059 * compatible with kernels that are not absolutely recent. */
2060 r = seccomp_rule_add_exact(
2061 seccomp,
2062 SCMP_ACT_ERRNO(EPERM),
2063 SCMP_SYS(openat2),
2064 0);
2065 if (r < 0)
2066 log_debug_errno(r, "Failed to add filter for openat2: %m");
2067 else
2068 any = true;
2069#endif
2070
da4dc9a6
ZJS
2071 r = seccomp_rule_add_exact(
2072 seccomp,
2073 SCMP_ACT_ERRNO(EPERM),
2074 SCMP_SYS(creat),
2075 1,
2076 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2077 if (r < 0)
2078 log_debug_errno(r, "Failed to add filter for creat: %m");
2079 else
2080 any = true;
2081
2082 return any ? 0 : r;
2083}
2084
3c27973b
LP
2085int seccomp_restrict_suid_sgid(void) {
2086 uint32_t arch;
da4dc9a6 2087 int r, k;
3c27973b
LP
2088
2089 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2090 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2091
2092 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2093 if (r < 0)
2094 return r;
2095
da4dc9a6
ZJS
2096 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2097 if (r < 0)
2098 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2099
da4dc9a6
ZJS
2100 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2101 if (k < 0)
2102 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2103
da4dc9a6 2104 if (r < 0 && k < 0)
3c27973b 2105 continue;
3c27973b
LP
2106
2107 r = seccomp_load(seccomp);
7bc5e0b1 2108 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2109 return r;
2110 if (r < 0)
2111 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2112 }
2113
2114 return 0;
2115}
915fb324
LP
2116
2117uint32_t scmp_act_kill_process(void) {
2118
2119 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2120 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2121 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2122 * for single-threaded apps does the right thing. */
2123
2124#ifdef SCMP_ACT_KILL_PROCESS
2125 if (seccomp_api_get() >= 3)
2126 return SCMP_ACT_KILL_PROCESS;
2127#endif
2128
2129 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2130}