]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
Merge pull request #16686 from bluca/mount_images_opts
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
3c27973b 4#include <fcntl.h>
469830d1 5#include <linux/seccomp.h>
57183d11 6#include <seccomp.h>
a8fbdf54 7#include <stddef.h>
469830d1 8#include <sys/mman.h>
d347d902 9#include <sys/prctl.h>
469830d1 10#include <sys/shm.h>
3c27973b 11#include <sys/stat.h>
57183d11 12
469830d1 13#include "af-list.h"
add00535 14#include "alloc-util.h"
d8b4d14d 15#include "errno-list.h"
a8fbdf54 16#include "macro.h"
add00535 17#include "nsflags.h"
d8b4d14d 18#include "nulstr-util.h"
78e864e5 19#include "process-util.h"
cf0fbc49 20#include "seccomp-util.h"
b16bd535 21#include "set.h"
07630cea 22#include "string-util.h"
b16bd535 23#include "strv.h"
469830d1
LP
24
25const uint32_t seccomp_local_archs[] = {
26
6b000af4 27 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
f2d9751c
LP
28
29#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
f2d9751c
LP
32 SCMP_ARCH_X32, /* native */
33#elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
469830d1 35 SCMP_ARCH_X32,
f2d9751c
LP
36 SCMP_ARCH_X86_64, /* native */
37#elif defined(__i386__)
38 SCMP_ARCH_X86,
39#elif defined(__aarch64__)
469830d1 40 SCMP_ARCH_ARM,
f2d9751c
LP
41 SCMP_ARCH_AARCH64, /* native */
42#elif defined(__arm__)
43 SCMP_ARCH_ARM,
44#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 48 SCMP_ARCH_MIPS,
f2d9751c
LP
49 SCMP_ARCH_MIPSEL, /* native */
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
469830d1 54 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
469830d1 59 SCMP_ARCH_MIPSEL,
f2d9751c
LP
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
469830d1 67 SCMP_ARCH_MIPSEL64,
f2d9751c 68 SCMP_ARCH_MIPS64,
469830d1 69 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
70 SCMP_ARCH_MIPS64N32, /* native */
71#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 79 SCMP_ARCH_PPC,
469830d1 80 SCMP_ARCH_PPC64LE,
f2d9751c
LP
81 SCMP_ARCH_PPC64, /* native */
82#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86#elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
f9252236
AJ
88#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
89 SCMP_ARCH_RISCV64,
f2d9751c
LP
90#elif defined(__s390x__)
91 SCMP_ARCH_S390,
92 SCMP_ARCH_S390X, /* native */
93#elif defined(__s390__)
469830d1 94 SCMP_ARCH_S390,
469830d1
LP
95#endif
96 (uint32_t) -1
97 };
57183d11
LP
98
99const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
100 /* Maintain order used in <seccomp.h>.
101 *
102 * Names used here should be the same as those used for ConditionArchitecture=,
103 * except for "subarchitectures" like x32. */
57183d11 104
aa34055f
ZJS
105 switch(c) {
106 case SCMP_ARCH_NATIVE:
57183d11 107 return "native";
aa34055f 108 case SCMP_ARCH_X86:
57183d11 109 return "x86";
aa34055f 110 case SCMP_ARCH_X86_64:
57183d11 111 return "x86-64";
aa34055f 112 case SCMP_ARCH_X32:
57183d11 113 return "x32";
aa34055f 114 case SCMP_ARCH_ARM:
57183d11 115 return "arm";
aa34055f
ZJS
116 case SCMP_ARCH_AARCH64:
117 return "arm64";
118 case SCMP_ARCH_MIPS:
119 return "mips";
120 case SCMP_ARCH_MIPS64:
121 return "mips64";
122 case SCMP_ARCH_MIPS64N32:
123 return "mips64-n32";
124 case SCMP_ARCH_MIPSEL:
125 return "mips-le";
126 case SCMP_ARCH_MIPSEL64:
127 return "mips64-le";
128 case SCMP_ARCH_MIPSEL64N32:
129 return "mips64-le-n32";
130 case SCMP_ARCH_PPC:
131 return "ppc";
132 case SCMP_ARCH_PPC64:
133 return "ppc64";
134 case SCMP_ARCH_PPC64LE:
135 return "ppc64-le";
f9252236
AJ
136#ifdef SCMP_ARCH_RISCV64
137 case SCMP_ARCH_RISCV64:
138 return "riscv64";
139#endif
aa34055f 140 case SCMP_ARCH_S390:
6abfd303 141 return "s390";
aa34055f 142 case SCMP_ARCH_S390X:
6abfd303 143 return "s390x";
aa34055f
ZJS
144 default:
145 return NULL;
146 }
57183d11
LP
147}
148
149int seccomp_arch_from_string(const char *n, uint32_t *ret) {
150 if (!n)
151 return -EINVAL;
152
153 assert(ret);
154
155 if (streq(n, "native"))
156 *ret = SCMP_ARCH_NATIVE;
157 else if (streq(n, "x86"))
158 *ret = SCMP_ARCH_X86;
159 else if (streq(n, "x86-64"))
160 *ret = SCMP_ARCH_X86_64;
161 else if (streq(n, "x32"))
162 *ret = SCMP_ARCH_X32;
163 else if (streq(n, "arm"))
164 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
165 else if (streq(n, "arm64"))
166 *ret = SCMP_ARCH_AARCH64;
167 else if (streq(n, "mips"))
168 *ret = SCMP_ARCH_MIPS;
169 else if (streq(n, "mips64"))
170 *ret = SCMP_ARCH_MIPS64;
171 else if (streq(n, "mips64-n32"))
172 *ret = SCMP_ARCH_MIPS64N32;
173 else if (streq(n, "mips-le"))
174 *ret = SCMP_ARCH_MIPSEL;
175 else if (streq(n, "mips64-le"))
176 *ret = SCMP_ARCH_MIPSEL64;
177 else if (streq(n, "mips64-le-n32"))
178 *ret = SCMP_ARCH_MIPSEL64N32;
179 else if (streq(n, "ppc"))
180 *ret = SCMP_ARCH_PPC;
181 else if (streq(n, "ppc64"))
182 *ret = SCMP_ARCH_PPC64;
183 else if (streq(n, "ppc64-le"))
184 *ret = SCMP_ARCH_PPC64LE;
f9252236
AJ
185#ifdef SCMP_ARCH_RISCV64
186 else if (streq(n, "riscv64"))
187 *ret = SCMP_ARCH_RISCV64;
188#endif
6abfd303
HB
189 else if (streq(n, "s390"))
190 *ret = SCMP_ARCH_S390;
191 else if (streq(n, "s390x"))
192 *ret = SCMP_ARCH_S390X;
57183d11
LP
193 else
194 return -EINVAL;
195
196 return 0;
197}
e9642be2 198
469830d1 199int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
b4eaa6cc 200 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8d7b0c8f
LP
201 int r;
202
469830d1
LP
203 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
204 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
205
206 seccomp = seccomp_init(default_action);
207 if (!seccomp)
208 return -ENOMEM;
209
469830d1
LP
210 if (arch != SCMP_ARCH_NATIVE &&
211 arch != seccomp_arch_native()) {
212
1b52793d 213 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1 214 if (r < 0)
b4eaa6cc 215 return r;
469830d1 216
1b52793d 217 r = seccomp_arch_add(seccomp, arch);
469830d1 218 if (r < 0)
b4eaa6cc 219 return r;
469830d1
LP
220
221 assert(seccomp_arch_exist(seccomp, arch) >= 0);
222 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
223 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
224 } else {
225 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
226 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
227 }
228
229 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f 230 if (r < 0)
b4eaa6cc 231 return r;
8d7b0c8f
LP
232
233 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
234 if (r < 0)
b4eaa6cc 235 return r;
8d7b0c8f 236
b4eaa6cc 237 *ret = TAKE_PTR(seccomp);
8d7b0c8f 238 return 0;
8d7b0c8f
LP
239}
240
/* Probes for basic seccomp support by issuing prctl(PR_GET_SECCOMP). A non-negative return value
 * is taken as "available", a negative one as "not available". */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
244
/* Probes for seccomp filter mode by calling prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER) with a
 * NULL filter pointer. Only a failure with errno set to EFAULT counts as "available"; success or
 * any other errno counts as "not available". */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
249
/* Reports whether both the basic seccomp probe and the filter-mode probe succeed. The probes run
 * on the first call only; the outcome is kept in a function-local static and returned directly
 * afterwards. */
bool is_seccomp_available(void) {
        static int cache = -1; /* -1: not probed yet, otherwise 0 or 1 */

        if (cache >= 0)
                return cache;

        cache = is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cache;
}
260
8130926d 261const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 262 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 263 .name = "@default",
d5efc18b 264 .help = "System calls that are always permitted",
40eb6a80
ZJS
265 .value =
266 "clock_getres\0"
6ca67710 267 "clock_getres_time64\0"
40eb6a80 268 "clock_gettime\0"
6ca67710 269 "clock_gettime64\0"
40eb6a80 270 "clock_nanosleep\0"
6ca67710 271 "clock_nanosleep_time64\0"
40eb6a80
ZJS
272 "execve\0"
273 "exit\0"
274 "exit_group\0"
e41b0f42 275 "futex\0"
6ca67710 276 "futex_time64\0"
e41b0f42
LP
277 "get_robust_list\0"
278 "get_thread_area\0"
09d3020b
DH
279 "getegid\0"
280 "getegid32\0"
281 "geteuid\0"
282 "geteuid32\0"
283 "getgid\0"
284 "getgid32\0"
285 "getgroups\0"
286 "getgroups32\0"
287 "getpgid\0"
288 "getpgrp\0"
289 "getpid\0"
290 "getppid\0"
291 "getresgid\0"
292 "getresgid32\0"
293 "getresuid\0"
294 "getresuid32\0"
40eb6a80 295 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
296 "getsid\0"
297 "gettid\0"
40eb6a80 298 "gettimeofday\0"
09d3020b
DH
299 "getuid\0"
300 "getuid32\0"
e41b0f42 301 "membarrier\0"
40eb6a80
ZJS
302 "nanosleep\0"
303 "pause\0"
4c3a9176 304 "prlimit64\0"
e41b0f42 305 "restart_syscall\0"
6fee3be0 306 "rseq\0"
40eb6a80 307 "rt_sigreturn\0"
8f44de08 308 "sched_yield\0"
e41b0f42
LP
309 "set_robust_list\0"
310 "set_thread_area\0"
311 "set_tid_address\0"
ce5faeac 312 "set_tls\0"
40eb6a80
ZJS
313 "sigreturn\0"
314 "time\0"
4c3a9176 315 "ugetrlimit\0"
40eb6a80 316 },
44898c53
LP
317 [SYSCALL_FILTER_SET_AIO] = {
318 .name = "@aio",
319 .help = "Asynchronous IO",
320 .value =
321 "io_cancel\0"
322 "io_destroy\0"
323 "io_getevents\0"
a05cfe23 324 "io_pgetevents\0"
6ca67710 325 "io_pgetevents_time64\0"
44898c53
LP
326 "io_setup\0"
327 "io_submit\0"
9e486265
LP
328 "io_uring_enter\0"
329 "io_uring_register\0"
330 "io_uring_setup\0"
44898c53 331 },
133ddbbe 332 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 333 .name = "@basic-io",
d5efc18b 334 .help = "Basic IO",
133ddbbe 335 .value =
648a0ed0 336 "_llseek\0"
133ddbbe 337 "close\0"
648a0ed0 338 "dup\0"
133ddbbe
LP
339 "dup2\0"
340 "dup3\0"
133ddbbe
LP
341 "lseek\0"
342 "pread64\0"
343 "preadv\0"
44898c53 344 "preadv2\0"
133ddbbe
LP
345 "pwrite64\0"
346 "pwritev\0"
44898c53 347 "pwritev2\0"
133ddbbe
LP
348 "read\0"
349 "readv\0"
350 "write\0"
351 "writev\0"
352 },
44898c53
LP
353 [SYSCALL_FILTER_SET_CHOWN] = {
354 .name = "@chown",
355 .help = "Change ownership of files and directories",
356 .value =
357 "chown\0"
358 "chown32\0"
359 "fchown\0"
360 "fchown32\0"
361 "fchownat\0"
362 "lchown\0"
363 "lchown32\0"
364 },
8130926d 365 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 366 .name = "@clock",
d5efc18b 367 .help = "Change the system time",
201c1cc2
TM
368 .value =
369 "adjtimex\0"
1f9ac68b 370 "clock_adjtime\0"
6ca67710 371 "clock_adjtime64\0"
1f9ac68b 372 "clock_settime\0"
6ca67710 373 "clock_settime64\0"
201c1cc2 374 "settimeofday\0"
1f9ac68b 375 "stime\0"
8130926d
LP
376 },
377 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 378 .name = "@cpu-emulation",
d5efc18b 379 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
380 .value =
381 "modify_ldt\0"
382 "subpage_prot\0"
383 "switch_endian\0"
384 "vm86\0"
385 "vm86old\0"
8130926d
LP
386 },
387 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 388 .name = "@debug",
d5efc18b 389 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
390 .value =
391 "lookup_dcookie\0"
392 "perf_event_open\0"
8270e3d8 393 "pidfd_getfd\0"
1f9ac68b
LP
394 "ptrace\0"
395 "rtas\0"
6da432fd 396#if defined __s390__ || defined __s390x__
1f9ac68b 397 "s390_runtime_instr\0"
8130926d 398#endif
1f9ac68b 399 "sys_debug_setcontext\0"
8130926d 400 },
1a1b13c9
LP
401 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
402 .name = "@file-system",
403 .help = "File system operations",
404 .value =
405 "access\0"
406 "chdir\0"
407 "chmod\0"
408 "close\0"
409 "creat\0"
410 "faccessat\0"
bcf08acb 411 "faccessat2\0"
1a1b13c9
LP
412 "fallocate\0"
413 "fchdir\0"
414 "fchmod\0"
415 "fchmodat\0"
1a1b13c9 416 "fcntl\0"
ceaa6aa7 417 "fcntl64\0"
1a1b13c9
LP
418 "fgetxattr\0"
419 "flistxattr\0"
ceaa6aa7 420 "fremovexattr\0"
1a1b13c9 421 "fsetxattr\0"
1a1b13c9 422 "fstat\0"
ceaa6aa7 423 "fstat64\0"
1a1b13c9 424 "fstatat64\0"
1a1b13c9 425 "fstatfs\0"
ceaa6aa7 426 "fstatfs64\0"
1a1b13c9 427 "ftruncate\0"
ceaa6aa7 428 "ftruncate64\0"
1a1b13c9
LP
429 "futimesat\0"
430 "getcwd\0"
1a1b13c9 431 "getdents\0"
ceaa6aa7 432 "getdents64\0"
1a1b13c9
LP
433 "getxattr\0"
434 "inotify_add_watch\0"
ceaa6aa7 435 "inotify_init\0"
1a1b13c9
LP
436 "inotify_init1\0"
437 "inotify_rm_watch\0"
438 "lgetxattr\0"
439 "link\0"
440 "linkat\0"
441 "listxattr\0"
442 "llistxattr\0"
443 "lremovexattr\0"
444 "lsetxattr\0"
1a1b13c9 445 "lstat\0"
ceaa6aa7 446 "lstat64\0"
1a1b13c9
LP
447 "mkdir\0"
448 "mkdirat\0"
449 "mknod\0"
450 "mknodat\0"
1a1b13c9 451 "mmap\0"
ceaa6aa7 452 "mmap2\0"
7961116e 453 "munmap\0"
1a1b13c9 454 "newfstatat\0"
ceaa6aa7
LP
455 "oldfstat\0"
456 "oldlstat\0"
457 "oldstat\0"
1a1b13c9
LP
458 "open\0"
459 "openat\0"
8270e3d8 460 "openat2\0"
1a1b13c9
LP
461 "readlink\0"
462 "readlinkat\0"
463 "removexattr\0"
464 "rename\0"
1a1b13c9 465 "renameat\0"
ceaa6aa7 466 "renameat2\0"
1a1b13c9
LP
467 "rmdir\0"
468 "setxattr\0"
1a1b13c9 469 "stat\0"
ceaa6aa7 470 "stat64\0"
1a1b13c9 471 "statfs\0"
ceaa6aa7 472 "statfs64\0"
a4135a74 473 "statx\0"
1a1b13c9
LP
474 "symlink\0"
475 "symlinkat\0"
1a1b13c9 476 "truncate\0"
ceaa6aa7 477 "truncate64\0"
1a1b13c9
LP
478 "unlink\0"
479 "unlinkat\0"
ceaa6aa7 480 "utime\0"
1a1b13c9 481 "utimensat\0"
6ca67710 482 "utimensat_time64\0"
1a1b13c9
LP
483 "utimes\0"
484 },
8130926d 485 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 486 .name = "@io-event",
d5efc18b 487 .help = "Event loop system calls",
201c1cc2
TM
488 .value =
489 "_newselect\0"
201c1cc2 490 "epoll_create\0"
215728ff 491 "epoll_create1\0"
201c1cc2
TM
492 "epoll_ctl\0"
493 "epoll_ctl_old\0"
494 "epoll_pwait\0"
495 "epoll_wait\0"
496 "epoll_wait_old\0"
201c1cc2 497 "eventfd\0"
215728ff 498 "eventfd2\0"
201c1cc2
TM
499 "poll\0"
500 "ppoll\0"
6ca67710 501 "ppoll_time64\0"
201c1cc2 502 "pselect6\0"
6ca67710 503 "pselect6_time64\0"
201c1cc2 504 "select\0"
8130926d
LP
505 },
506 [SYSCALL_FILTER_SET_IPC] = {
8130926d 507 .name = "@ipc",
d5efc18b
ZJS
508 .help = "SysV IPC, POSIX Message Queues or other IPC",
509 .value =
510 "ipc\0"
cd5bfd7e 511 "memfd_create\0"
201c1cc2
TM
512 "mq_getsetattr\0"
513 "mq_notify\0"
514 "mq_open\0"
515 "mq_timedreceive\0"
6ca67710 516 "mq_timedreceive_time64\0"
201c1cc2 517 "mq_timedsend\0"
6ca67710 518 "mq_timedsend_time64\0"
201c1cc2
TM
519 "mq_unlink\0"
520 "msgctl\0"
521 "msgget\0"
522 "msgrcv\0"
523 "msgsnd\0"
cd5bfd7e 524 "pipe\0"
215728ff 525 "pipe2\0"
201c1cc2
TM
526 "process_vm_readv\0"
527 "process_vm_writev\0"
528 "semctl\0"
529 "semget\0"
530 "semop\0"
531 "semtimedop\0"
6ca67710 532 "semtimedop_time64\0"
201c1cc2
TM
533 "shmat\0"
534 "shmctl\0"
535 "shmdt\0"
536 "shmget\0"
8130926d
LP
537 },
538 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 539 .name = "@keyring",
d5efc18b 540 .help = "Kernel keyring access",
1f9ac68b
LP
541 .value =
542 "add_key\0"
543 "keyctl\0"
544 "request_key\0"
8130926d 545 },
cd0ddf6f
LP
546 [SYSCALL_FILTER_SET_MEMLOCK] = {
547 .name = "@memlock",
548 .help = "Memory locking control",
549 .value =
550 "mlock\0"
551 "mlock2\0"
552 "mlockall\0"
553 "munlock\0"
554 "munlockall\0"
555 },
8130926d 556 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 557 .name = "@module",
d5efc18b 558 .help = "Loading and unloading of kernel modules",
201c1cc2 559 .value =
201c1cc2
TM
560 "delete_module\0"
561 "finit_module\0"
562 "init_module\0"
8130926d
LP
563 },
564 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 565 .name = "@mount",
d5efc18b 566 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
567 .value =
568 "chroot\0"
9e486265
LP
569 "fsconfig\0"
570 "fsmount\0"
571 "fsopen\0"
572 "fspick\0"
201c1cc2 573 "mount\0"
9e486265
LP
574 "move_mount\0"
575 "open_tree\0"
201c1cc2 576 "pivot_root\0"
201c1cc2 577 "umount\0"
215728ff 578 "umount2\0"
8130926d
LP
579 },
580 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 581 .name = "@network-io",
d5efc18b 582 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 583 .value =
201c1cc2 584 "accept\0"
215728ff 585 "accept4\0"
201c1cc2
TM
586 "bind\0"
587 "connect\0"
588 "getpeername\0"
589 "getsockname\0"
590 "getsockopt\0"
591 "listen\0"
592 "recv\0"
593 "recvfrom\0"
594 "recvmmsg\0"
6ca67710 595 "recvmmsg_time64\0"
201c1cc2
TM
596 "recvmsg\0"
597 "send\0"
598 "sendmmsg\0"
599 "sendmsg\0"
600 "sendto\0"
601 "setsockopt\0"
602 "shutdown\0"
603 "socket\0"
604 "socketcall\0"
605 "socketpair\0"
8130926d
LP
606 },
607 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 608 /* some unknown even to libseccomp */
8130926d 609 .name = "@obsolete",
d5efc18b 610 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
611 .value =
612 "_sysctl\0"
613 "afs_syscall\0"
802fa07a 614 "bdflush\0"
201c1cc2 615 "break\0"
1f9ac68b 616 "create_module\0"
201c1cc2
TM
617 "ftime\0"
618 "get_kernel_syms\0"
201c1cc2
TM
619 "getpmsg\0"
620 "gtty\0"
7e0c3b8f 621 "idle\0"
201c1cc2 622 "lock\0"
201c1cc2 623 "mpx\0"
201c1cc2
TM
624 "prof\0"
625 "profil\0"
201c1cc2
TM
626 "putpmsg\0"
627 "query_module\0"
201c1cc2
TM
628 "security\0"
629 "sgetmask\0"
630 "ssetmask\0"
631 "stty\0"
1f9ac68b 632 "sysfs\0"
201c1cc2
TM
633 "tuxcall\0"
634 "ulimit\0"
635 "uselib\0"
1f9ac68b 636 "ustat\0"
201c1cc2 637 "vserver\0"
8130926d 638 },
9493b168
ZJS
639 [SYSCALL_FILTER_SET_PKEY] = {
640 .name = "@pkey",
641 .help = "System calls used for memory protection keys",
642 .value =
643 "pkey_alloc\0"
644 "pkey_free\0"
645 "pkey_mprotect\0"
646 },
8130926d 647 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 648 .name = "@privileged",
d5efc18b 649 .help = "All system calls which need super-user capabilities",
201c1cc2 650 .value =
44898c53 651 "@chown\0"
201c1cc2
TM
652 "@clock\0"
653 "@module\0"
654 "@raw-io\0"
af0f047b
LP
655 "@reboot\0"
656 "@swap\0"
215728ff 657 "_sysctl\0"
201c1cc2 658 "acct\0"
201c1cc2 659 "bpf\0"
1f9ac68b 660 "capset\0"
201c1cc2 661 "chroot\0"
a05cfe23 662 "fanotify_init\0"
9e486265 663 "fanotify_mark\0"
201c1cc2 664 "nfsservctl\0"
a05cfe23 665 "open_by_handle_at\0"
201c1cc2
TM
666 "pivot_root\0"
667 "quotactl\0"
201c1cc2 668 "setdomainname\0"
201c1cc2 669 "setfsuid\0"
215728ff 670 "setfsuid32\0"
201c1cc2 671 "setgroups\0"
215728ff 672 "setgroups32\0"
201c1cc2 673 "sethostname\0"
201c1cc2 674 "setresuid\0"
215728ff 675 "setresuid32\0"
201c1cc2 676 "setreuid\0"
215728ff 677 "setreuid32\0"
e05ee49b 678 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 679 "setuid32\0"
201c1cc2 680 "vhangup\0"
8130926d
LP
681 },
682 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 683 .name = "@process",
d5efc18b 684 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
685 .value =
686 "arch_prctl\0"
09d3020b 687 "capget\0" /* Able to query arbitrary processes */
201c1cc2 688 "clone\0"
9e486265 689 "clone3\0"
201c1cc2
TM
690 "execveat\0"
691 "fork\0"
b887d2eb 692 "getrusage\0"
201c1cc2 693 "kill\0"
9e486265 694 "pidfd_open\0"
46fcf95d 695 "pidfd_send_signal\0"
201c1cc2 696 "prctl\0"
b887d2eb
LP
697 "rt_sigqueueinfo\0"
698 "rt_tgsigqueueinfo\0"
201c1cc2 699 "setns\0"
a9518dc3 700 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 701 "tgkill\0"
b887d2eb 702 "times\0"
201c1cc2
TM
703 "tkill\0"
704 "unshare\0"
705 "vfork\0"
b887d2eb
LP
706 "wait4\0"
707 "waitid\0"
708 "waitpid\0"
8130926d
LP
709 },
710 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 711 .name = "@raw-io",
d5efc18b 712 .help = "Raw I/O port access",
201c1cc2
TM
713 .value =
714 "ioperm\0"
715 "iopl\0"
1f9ac68b 716 "pciconfig_iobase\0"
201c1cc2
TM
717 "pciconfig_read\0"
718 "pciconfig_write\0"
6da432fd 719#if defined __s390__ || defined __s390x__
201c1cc2
TM
720 "s390_pci_mmio_read\0"
721 "s390_pci_mmio_write\0"
8130926d
LP
722#endif
723 },
bd2ab3f4
LP
724 [SYSCALL_FILTER_SET_REBOOT] = {
725 .name = "@reboot",
726 .help = "Reboot and reboot preparation/kexec",
727 .value =
bd2ab3f4 728 "kexec_file_load\0"
e59608fa 729 "kexec_load\0"
bd2ab3f4
LP
730 "reboot\0"
731 },
133ddbbe 732 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 733 .name = "@resources",
58a8f68b 734 .help = "Alter resource settings",
133ddbbe 735 .value =
0963c053
LP
736 "ioprio_set\0"
737 "mbind\0"
738 "migrate_pages\0"
739 "move_pages\0"
740 "nice\0"
0963c053
LP
741 "sched_setaffinity\0"
742 "sched_setattr\0"
133ddbbe
LP
743 "sched_setparam\0"
744 "sched_setscheduler\0"
0963c053 745 "set_mempolicy\0"
133ddbbe
LP
746 "setpriority\0"
747 "setrlimit\0"
133ddbbe 748 },
6eaaeee9
LP
749 [SYSCALL_FILTER_SET_SETUID] = {
750 .name = "@setuid",
751 .help = "Operations for changing user/group credentials",
752 .value =
6eaaeee9 753 "setgid\0"
215728ff 754 "setgid32\0"
6eaaeee9 755 "setgroups\0"
215728ff 756 "setgroups32\0"
6eaaeee9 757 "setregid\0"
215728ff 758 "setregid32\0"
6eaaeee9 759 "setresgid\0"
215728ff 760 "setresgid32\0"
6eaaeee9 761 "setresuid\0"
215728ff 762 "setresuid32\0"
6eaaeee9 763 "setreuid\0"
215728ff 764 "setreuid32\0"
6eaaeee9 765 "setuid\0"
215728ff 766 "setuid32\0"
6eaaeee9 767 },
cd0ddf6f
LP
768 [SYSCALL_FILTER_SET_SIGNAL] = {
769 .name = "@signal",
770 .help = "Process signal handling",
771 .value =
772 "rt_sigaction\0"
773 "rt_sigpending\0"
774 "rt_sigprocmask\0"
775 "rt_sigsuspend\0"
776 "rt_sigtimedwait\0"
6ca67710 777 "rt_sigtimedwait_time64\0"
cd0ddf6f
LP
778 "sigaction\0"
779 "sigaltstack\0"
780 "signal\0"
781 "signalfd\0"
782 "signalfd4\0"
783 "sigpending\0"
784 "sigprocmask\0"
785 "sigsuspend\0"
786 },
bd2ab3f4
LP
787 [SYSCALL_FILTER_SET_SWAP] = {
788 .name = "@swap",
789 .help = "Enable/disable swap devices",
790 .value =
791 "swapoff\0"
792 "swapon\0"
793 },
44898c53
LP
794 [SYSCALL_FILTER_SET_SYNC] = {
795 .name = "@sync",
796 .help = "Synchronize files and memory to storage",
797 .value =
798 "fdatasync\0"
799 "fsync\0"
800 "msync\0"
801 "sync\0"
802 "sync_file_range\0"
a8fb09f5 803 "sync_file_range2\0"
44898c53
LP
804 "syncfs\0"
805 },
70526841
LP
806 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
807 .name = "@system-service",
808 .help = "General system service operations",
809 .value =
810 "@aio\0"
811 "@basic-io\0"
812 "@chown\0"
813 "@default\0"
814 "@file-system\0"
815 "@io-event\0"
816 "@ipc\0"
817 "@keyring\0"
818 "@memlock\0"
819 "@network-io\0"
820 "@process\0"
821 "@resources\0"
822 "@setuid\0"
823 "@signal\0"
824 "@sync\0"
825 "@timer\0"
826 "brk\0"
827 "capget\0"
828 "capset\0"
829 "copy_file_range\0"
830 "fadvise64\0"
831 "fadvise64_64\0"
832 "flock\0"
833 "get_mempolicy\0"
834 "getcpu\0"
835 "getpriority\0"
836 "getrandom\0"
837 "ioctl\0"
838 "ioprio_get\0"
839 "kcmp\0"
840 "madvise\0"
70526841
LP
841 "mprotect\0"
842 "mremap\0"
843 "name_to_handle_at\0"
844 "oldolduname\0"
845 "olduname\0"
846 "personality\0"
847 "readahead\0"
848 "readdir\0"
849 "remap_file_pages\0"
850 "sched_get_priority_max\0"
851 "sched_get_priority_min\0"
852 "sched_getaffinity\0"
853 "sched_getattr\0"
854 "sched_getparam\0"
855 "sched_getscheduler\0"
856 "sched_rr_get_interval\0"
6ca67710 857 "sched_rr_get_interval_time64\0"
70526841
LP
858 "sched_yield\0"
859 "sendfile\0"
860 "sendfile64\0"
861 "setfsgid\0"
862 "setfsgid32\0"
863 "setfsuid\0"
864 "setfsuid32\0"
865 "setpgid\0"
866 "setsid\0"
867 "splice\0"
868 "sysinfo\0"
869 "tee\0"
870 "umask\0"
871 "uname\0"
872 "userfaultfd\0"
873 "vmsplice\0"
874 },
cd0ddf6f
LP
875 [SYSCALL_FILTER_SET_TIMER] = {
876 .name = "@timer",
877 .help = "Schedule operations by time",
878 .value =
879 "alarm\0"
880 "getitimer\0"
881 "setitimer\0"
882 "timer_create\0"
883 "timer_delete\0"
884 "timer_getoverrun\0"
885 "timer_gettime\0"
6ca67710 886 "timer_gettime64\0"
cd0ddf6f 887 "timer_settime\0"
6ca67710 888 "timer_settime64\0"
cd0ddf6f
LP
889 "timerfd_create\0"
890 "timerfd_gettime\0"
6ca67710 891 "timerfd_gettime64\0"
cd0ddf6f 892 "timerfd_settime\0"
6ca67710 893 "timerfd_settime64\0"
cd0ddf6f
LP
894 "times\0"
895 },
201c1cc2 896};
8130926d
LP
897
898const SyscallFilterSet *syscall_filter_set_find(const char *name) {
899 unsigned i;
900
901 if (isempty(name) || name[0] != '@')
902 return NULL;
903
904 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
905 if (streq(syscall_filter_sets[i].name, name))
906 return syscall_filter_sets + i;
907
908 return NULL;
909}
910
b54f36c6 911static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 912
b54f36c6 913int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
914 assert(seccomp);
915 assert(name);
916
960e4569
LP
917 if (strv_contains(exclude, name))
918 return 0;
919
69b1b241
LP
920 if (name[0] == '@') {
921 const SyscallFilterSet *other;
922
923 other = syscall_filter_set_find(name);
baaa35ad
ZJS
924 if (!other)
925 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
926 "Filter set %s is not known!",
927 name);
69b1b241 928
b54f36c6
ZJS
929 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
930
69b1b241 931 } else {
b54f36c6 932 int id, r;
69b1b241
LP
933
934 id = seccomp_syscall_resolve_name(name);
cff7bff8 935 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
936 if (log_missing)
937 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 938 return 0;
cff7bff8 939 }
69b1b241
LP
940
941 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 942 if (r < 0) {
69b1b241 943 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
944 bool ignore = r == -EDOM;
945
946 if (!ignore || log_missing)
947 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
948 name, id, ignore ? ", ignoring" : "");
949 if (!ignore)
950 return r;
b54f36c6 951 }
69b1b241 952
b54f36c6
ZJS
953 return 0;
954 }
69b1b241
LP
955}
956
469830d1
LP
957static int seccomp_add_syscall_filter_set(
958 scmp_filter_ctx seccomp,
469830d1 959 const SyscallFilterSet *set,
960e4569 960 uint32_t action,
b54f36c6
ZJS
961 char **exclude,
962 bool log_missing) {
469830d1 963
8130926d
LP
964 const char *sys;
965 int r;
966
967 assert(seccomp);
968 assert(set);
969
970 NULSTR_FOREACH(sys, set->value) {
b54f36c6 971 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
972 if (r < 0)
973 return r;
469830d1
LP
974 }
975
976 return 0;
977}
978
b54f36c6 979int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
980 uint32_t arch;
981 int r;
982
983 assert(set);
984
985 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 986 * each local arch. */
469830d1
LP
987
988 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
989 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
990
991 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
992
993 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
994 if (r < 0)
995 return r;
469830d1 996
b54f36c6 997 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
998 if (r < 0)
999 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
1000
1001 r = seccomp_load(seccomp);
7bc5e0b1 1002 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1003 return r;
1004 if (r < 0)
1005 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
1006 }
1007
1008 return 0;
1009}
a3be2849 1010
b54f36c6 1011int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 1012 uint32_t arch;
a3be2849
LP
1013 int r;
1014
469830d1
LP
1015 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1016 * SyscallFilterSet* table. */
a3be2849 1017
8cfa775f 1018 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 1019 return 0;
a3be2849 1020
469830d1
LP
1021 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1022 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1023 Iterator i;
b54f36c6 1024 void *syscall_id, *val;
a3be2849 1025
469830d1 1026 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 1027
469830d1
LP
1028 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1029 if (r < 0)
1030 return r;
a3be2849 1031
b54f36c6 1032 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 1033 uint32_t a = action;
b54f36c6
ZJS
1034 int id = PTR_TO_INT(syscall_id) - 1;
1035 int error = PTR_TO_INT(val);
8cfa775f 1036
b54f36c6
ZJS
1037 if (action != SCMP_ACT_ALLOW && error >= 0)
1038 a = SCMP_ACT_ERRNO(error);
8cfa775f 1039
b54f36c6 1040 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
1041 if (r < 0) {
1042 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1043 _cleanup_free_ char *n = NULL;
7e86bd73 1044 bool ignore;
469830d1 1045
b54f36c6 1046 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
1047 ignore = r == -EDOM;
1048 if (!ignore || log_missing)
1049 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1050 strna(n), id, ignore ? ", ignoring" : "");
1051 if (!ignore)
1052 return r;
469830d1
LP
1053 }
1054 }
1055
1056 r = seccomp_load(seccomp);
7bc5e0b1 1057 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1058 return r;
1059 if (r < 0)
1060 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1061 }
1062
1063 return 0;
add00535
LP
1064}
1065
58f6ab44 1066int seccomp_parse_syscall_filter(
898748d8
YW
1067 const char *name,
1068 int errno_num,
1069 Hashmap *filter,
13d92c63 1070 SeccompParseFlags flags,
898748d8
YW
1071 const char *unit,
1072 const char *filename,
1073 unsigned line) {
1074
1075 int r;
1076
1077 assert(name);
1078 assert(filter);
1079
1080 if (name[0] == '@') {
1081 const SyscallFilterSet *set;
1082 const char *i;
1083
1084 set = syscall_filter_set_find(name);
1085 if (!set) {
13d92c63 1086 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1087 return -EINVAL;
13d92c63
LP
1088
1089 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1090 "Unknown system call group, ignoring: %s", name);
1091 return 0;
898748d8
YW
1092 }
1093
1094 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1095 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1096 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1097 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1098 * about them. */
58f6ab44 1099 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1100 if (r < 0)
1101 return r;
1102 }
1103 } else {
1104 int id;
1105
1106 id = seccomp_syscall_resolve_name(name);
1107 if (id == __NR_SCMP_ERROR) {
13d92c63 1108 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1109 return -EINVAL;
13d92c63
LP
1110
1111 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1112 "Failed to parse system call, ignoring: %s", name);
1113 return 0;
898748d8
YW
1114 }
1115
1116 /* If we previously wanted to forbid a syscall and now
1117 * we want to allow it, then remove it from the list. */
6b000af4 1118 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
898748d8
YW
1119 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1120 if (r < 0)
851ee70a
LW
1121 switch (r) {
1122 case -ENOMEM:
1123 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1124 case -EEXIST:
9d7fe7c6
LW
1125 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1126 break;
851ee70a
LW
1127 default:
1128 return r;
1129 }
898748d8
YW
1130 } else
1131 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1132 }
1133
1134 return 0;
1135}
1136
add00535 1137int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1138 uint32_t arch;
add00535
LP
1139 int r;
1140
f1d34068 1141 if (DEBUG_LOGGING) {
add00535
LP
1142 _cleanup_free_ char *s = NULL;
1143
86c2a9f1 1144 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1145 log_debug("Restricting namespace to: %s.", strna(s));
1146 }
1147
1148 /* NOOP? */
1149 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1150 return 0;
1151
469830d1
LP
1152 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1153 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1154 unsigned i;
add00535 1155
469830d1
LP
1156 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1157
1158 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1159 if (r < 0)
1160 return r;
1161
1162 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1163 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1164 * altogether. */
1165 r = seccomp_rule_add_exact(
1166 seccomp,
1167 SCMP_ACT_ERRNO(EPERM),
1168 SCMP_SYS(setns),
1169 0);
1170 else
1171 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1172 * special invocation with a zero flags argument, right here. */
1173 r = seccomp_rule_add_exact(
1174 seccomp,
1175 SCMP_ACT_ERRNO(EPERM),
1176 SCMP_SYS(setns),
1177 1,
1178 SCMP_A1(SCMP_CMP_EQ, 0));
1179 if (r < 0) {
1180 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1181 continue;
1182 }
1183
1184 for (i = 0; namespace_flag_map[i].name; i++) {
1185 unsigned long f;
1186
1187 f = namespace_flag_map[i].flag;
1188 if ((retain & f) == f) {
1189 log_debug("Permitting %s.", namespace_flag_map[i].name);
1190 continue;
1191 }
1192
1193 log_debug("Blocking %s.", namespace_flag_map[i].name);
1194
1195 r = seccomp_rule_add_exact(
1196 seccomp,
1197 SCMP_ACT_ERRNO(EPERM),
1198 SCMP_SYS(unshare),
1199 1,
1200 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1201 if (r < 0) {
1202 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1203 break;
1204 }
1205
511ceb1f
ZJS
1206 /* On s390/s390x the first two parameters to clone are switched */
1207 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1208 r = seccomp_rule_add_exact(
1209 seccomp,
1210 SCMP_ACT_ERRNO(EPERM),
1211 SCMP_SYS(clone),
1212 1,
1213 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1214 else
1215 r = seccomp_rule_add_exact(
1216 seccomp,
1217 SCMP_ACT_ERRNO(EPERM),
1218 SCMP_SYS(clone),
1219 1,
1220 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1221 if (r < 0) {
1222 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1223 break;
1224 }
1225
1226 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1227 r = seccomp_rule_add_exact(
1228 seccomp,
1229 SCMP_ACT_ERRNO(EPERM),
1230 SCMP_SYS(setns),
1231 1,
1232 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1233 if (r < 0) {
1234 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1235 break;
1236 }
1237 }
1238 }
1239 if (r < 0)
1240 continue;
1241
1242 r = seccomp_load(seccomp);
7bc5e0b1 1243 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1244 return r;
1245 if (r < 0)
1246 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1247 }
1248
1249 return 0;
1250}
1251
1252int seccomp_protect_sysctl(void) {
1253 uint32_t arch;
1254 int r;
1255
1256 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1257 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1258
1259 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1260
f9252236
AJ
1261 if (IN_SET(arch,
1262 SCMP_ARCH_AARCH64,
1263#ifdef SCMP_ARCH_RISCV64
1264 SCMP_ARCH_RISCV64,
1265#endif
1266 SCMP_ARCH_X32
1267 ))
2e64e8f4
ZJS
1268 /* No _sysctl syscall */
1269 continue;
1270
469830d1
LP
1271 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1272 if (r < 0)
1273 return r;
1274
1275 r = seccomp_rule_add_exact(
add00535
LP
1276 seccomp,
1277 SCMP_ACT_ERRNO(EPERM),
469830d1 1278 SCMP_SYS(_sysctl),
add00535 1279 0);
469830d1
LP
1280 if (r < 0) {
1281 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1282 continue;
1283 }
1284
1285 r = seccomp_load(seccomp);
7bc5e0b1 1286 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1287 return r;
1288 if (r < 0)
1289 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1290 }
1291
1292 return 0;
1293}
1294
620dbdd2
KK
1295int seccomp_protect_syslog(void) {
1296 uint32_t arch;
1297 int r;
1298
1299 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1300 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1301
1302 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1303 if (r < 0)
1304 return r;
1305
1306 r = seccomp_rule_add_exact(
1307 seccomp,
1308 SCMP_ACT_ERRNO(EPERM),
1309 SCMP_SYS(syslog),
1310 0);
1311
1312 if (r < 0) {
1313 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1314 continue;
1315 }
1316
1317 r = seccomp_load(seccomp);
1318 if (ERRNO_IS_SECCOMP_FATAL(r))
1319 return r;
1320 if (r < 0)
1321 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1322 }
1323
1324 return 0;
1325}
1326
6b000af4 1327int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
469830d1
LP
1328 uint32_t arch;
1329 int r;
1330
1331 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1332 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1333 bool supported;
469830d1
LP
1334 Iterator i;
1335
1336 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1337
9606bc4b
LP
1338 switch (arch) {
1339
1340 case SCMP_ARCH_X86_64:
1341 case SCMP_ARCH_X32:
1342 case SCMP_ARCH_ARM:
1343 case SCMP_ARCH_AARCH64:
0d9fca76 1344 case SCMP_ARCH_PPC:
da1921a5
ZJS
1345 case SCMP_ARCH_PPC64:
1346 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1347 case SCMP_ARCH_MIPSEL64N32:
1348 case SCMP_ARCH_MIPS64N32:
1349 case SCMP_ARCH_MIPSEL64:
1350 case SCMP_ARCH_MIPS64:
f9252236
AJ
1351#ifdef SCMP_ARCH_RISCV64
1352 case SCMP_ARCH_RISCV64:
1353#endif
9606bc4b
LP
1354 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1355 supported = true;
1356 break;
1357
9606bc4b
LP
1358 case SCMP_ARCH_S390:
1359 case SCMP_ARCH_S390X:
da1921a5 1360 case SCMP_ARCH_X86:
f5aeac14
JC
1361 case SCMP_ARCH_MIPSEL:
1362 case SCMP_ARCH_MIPS:
9606bc4b
LP
1363 default:
1364 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1365 * don't know */
1366 supported = false;
1367 break;
1368 }
1369
1370 if (!supported)
1371 continue;
1372
469830d1
LP
1373 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1374 if (r < 0)
1375 return r;
1376
6b000af4 1377 if (allow_list) {
469830d1
LP
1378 int af, first = 0, last = 0;
1379 void *afp;
1380
6b000af4
LP
1381 /* If this is an allow list, we first block the address families that are out of
1382 * range and then everything that is not in the set. First, we find the lowest and
1383 * highest address family in the set. */
469830d1
LP
1384
1385 SET_FOREACH(afp, address_families, i) {
1386 af = PTR_TO_INT(afp);
1387
1388 if (af <= 0 || af >= af_max())
1389 continue;
1390
1391 if (first == 0 || af < first)
1392 first = af;
1393
1394 if (last == 0 || af > last)
1395 last = af;
1396 }
1397
1398 assert((first == 0) == (last == 0));
1399
1400 if (first == 0) {
1401
1402 /* No entries in the valid range, block everything */
1403 r = seccomp_rule_add_exact(
1404 seccomp,
1405 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1406 SCMP_SYS(socket),
1407 0);
1408 if (r < 0) {
1409 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1410 continue;
1411 }
1412
1413 } else {
1414
1415 /* Block everything below the first entry */
1416 r = seccomp_rule_add_exact(
1417 seccomp,
1418 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1419 SCMP_SYS(socket),
1420 1,
1421 SCMP_A0(SCMP_CMP_LT, first));
1422 if (r < 0) {
1423 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 continue;
1425 }
1426
1427 /* Block everything above the last entry */
1428 r = seccomp_rule_add_exact(
1429 seccomp,
1430 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1431 SCMP_SYS(socket),
1432 1,
1433 SCMP_A0(SCMP_CMP_GT, last));
1434 if (r < 0) {
1435 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1436 continue;
1437 }
1438
1439 /* Block everything between the first and last entry */
1440 for (af = 1; af < af_max(); af++) {
1441
1442 if (set_contains(address_families, INT_TO_PTR(af)))
1443 continue;
1444
1445 r = seccomp_rule_add_exact(
1446 seccomp,
1447 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1448 SCMP_SYS(socket),
1449 1,
1450 SCMP_A0(SCMP_CMP_EQ, af));
1451 if (r < 0)
1452 break;
1453 }
469830d1
LP
1454 if (r < 0) {
1455 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1456 continue;
1457 }
1458 }
1459
1460 } else {
1461 void *af;
1462
6b000af4
LP
1463 /* If this is a deny list, then generate one rule for each address family that are
1464 * then combined in OR checks. */
469830d1
LP
1465
1466 SET_FOREACH(af, address_families, i) {
1467
1468 r = seccomp_rule_add_exact(
1469 seccomp,
1470 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1471 SCMP_SYS(socket),
1472 1,
1473 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1474 if (r < 0)
1475 break;
1476 }
469830d1
LP
1477 if (r < 0) {
1478 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1479 continue;
1480 }
1481 }
1482
1483 r = seccomp_load(seccomp);
7bc5e0b1 1484 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1485 return r;
1486 if (r < 0)
1487 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1488 }
1489
1490 return 0;
1491}
1492
1493int seccomp_restrict_realtime(void) {
1494 static const int permitted_policies[] = {
1495 SCHED_OTHER,
1496 SCHED_BATCH,
1497 SCHED_IDLE,
1498 };
1499
1500 int r, max_policy = 0;
1501 uint32_t arch;
1502 unsigned i;
1503
1504 /* Determine the highest policy constant we want to allow */
1505 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1506 if (permitted_policies[i] > max_policy)
1507 max_policy = permitted_policies[i];
1508
1509 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1510 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1511 int p;
1512
1513 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1514
1515 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1516 if (r < 0)
1517 return r;
1518
1519 /* Go through all policies with lower values than that, and block them -- unless they appear in the
6b000af4 1520 * allow list. */
469830d1
LP
1521 for (p = 0; p < max_policy; p++) {
1522 bool good = false;
1523
6b000af4 1524 /* Check if this is in the allow list. */
469830d1
LP
1525 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1526 if (permitted_policies[i] == p) {
1527 good = true;
1528 break;
1529 }
1530
1531 if (good)
1532 continue;
1533
1534 /* Deny this policy */
1535 r = seccomp_rule_add_exact(
1536 seccomp,
1537 SCMP_ACT_ERRNO(EPERM),
1538 SCMP_SYS(sched_setscheduler),
1539 1,
1540 SCMP_A1(SCMP_CMP_EQ, p));
1541 if (r < 0) {
1542 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1543 continue;
1544 }
1545 }
1546
6b000af4
LP
1547 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1548 * are unsigned here, hence no need no check for < 0 values. */
469830d1 1549 r = seccomp_rule_add_exact(
add00535
LP
1550 seccomp,
1551 SCMP_ACT_ERRNO(EPERM),
469830d1 1552 SCMP_SYS(sched_setscheduler),
add00535 1553 1,
469830d1
LP
1554 SCMP_A1(SCMP_CMP_GT, max_policy));
1555 if (r < 0) {
1556 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1557 continue;
1558 }
add00535 1559
469830d1 1560 r = seccomp_load(seccomp);
7bc5e0b1 1561 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1
LP
1562 return r;
1563 if (r < 0)
1564 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1565 }
1566
1567 return 0;
1568}
1569
6dc66688
ZJS
1570static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1571 uint32_t arch,
1572 int nr,
14cb109d 1573 unsigned arg_cnt,
6dc66688
ZJS
1574 const struct scmp_arg_cmp arg) {
1575 int r;
1576
1577 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1578 if (r < 0) {
1579 _cleanup_free_ char *n = NULL;
1580
1581 n = seccomp_syscall_resolve_num_arch(arch, nr);
1582 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1583 strna(n),
1584 seccomp_arch_to_string(arch));
1585 }
1586
1587 return r;
1588}
1589
/* For known architectures, check that syscalls are indeed defined or not.
 *
 * This is a build-time check only: on the architectures listed in the condition below, shmget(), shmat()
 * and shmdt() must resolve to positive system call numbers. seccomp_memory_deny_write_execute() further
 * down installs a rule for shmat() on several of these architectures. */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#endif
6dc66688 1596
469830d1
LP
1597int seccomp_memory_deny_write_execute(void) {
1598 uint32_t arch;
b069c2a3 1599 unsigned loaded = 0;
469830d1
LP
1600
1601 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1602 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
b069c2a3 1603 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
add00535 1604
469830d1
LP
1605 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1606
8a50cf69
LP
1607 switch (arch) {
1608
bed4668d
CE
1609 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1610 * We ignore that here, which means there's still a way to get writable/executable
1611 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1612
8a50cf69 1613 case SCMP_ARCH_X86:
57311925 1614 case SCMP_ARCH_S390:
8a50cf69
LP
1615 filter_syscall = SCMP_SYS(mmap2);
1616 block_syscall = SCMP_SYS(mmap);
bed4668d 1617 /* shmat multiplexed, see above */
2a8d6e63
ZJS
1618 break;
1619
63d00dfb 1620 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1621 case SCMP_ARCH_PPC64:
1622 case SCMP_ARCH_PPC64LE:
bed4668d 1623 case SCMP_ARCH_S390X:
2a8d6e63 1624 filter_syscall = SCMP_SYS(mmap);
bed4668d 1625 /* shmat multiplexed, see above */
8a50cf69
LP
1626 break;
1627
4278d1f5
ZJS
1628 case SCMP_ARCH_ARM:
1629 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1630 shmat_syscall = SCMP_SYS(shmat);
1631 break;
1632
8a50cf69
LP
1633 case SCMP_ARCH_X86_64:
1634 case SCMP_ARCH_X32:
79873bc8 1635 case SCMP_ARCH_AARCH64:
f9252236
AJ
1636#ifdef SCMP_ARCH_RISCV64
1637 case SCMP_ARCH_RISCV64:
1638#endif
1639 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
8a50cf69
LP
1640 shmat_syscall = SCMP_SYS(shmat);
1641 break;
1642
1643 /* Please add more definitions here, if you port systemd to other architectures! */
1644
f9252236 1645#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
8a50cf69
LP
1646#warning "Consider adding the right mmap() syscall definitions here!"
1647#endif
1648 }
1649
1650 /* Can't filter mmap() on this arch, then skip it */
1651 if (filter_syscall == 0)
1652 continue;
1653
469830d1
LP
1654 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1655 if (r < 0)
1656 return r;
1657
6dc66688
ZJS
1658 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1659 1,
1660 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1661 if (r < 0)
1662 continue;
8a50cf69
LP
1663
1664 if (block_syscall != 0) {
6dc66688
ZJS
1665 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1666 if (r < 0)
8a50cf69 1667 continue;
add00535 1668 }
a3be2849 1669
6dc66688
ZJS
1670 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1671 1,
b835eeb4
ZJS
1672 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1673 if (r < 0)
1674 continue;
1675
91691f1d 1676#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1677 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1678 1,
6dc66688
ZJS
1679 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1680 if (r < 0)
469830d1 1681 continue;
91691f1d 1682#endif
add00535 1683
67fb5f33 1684 if (shmat_syscall > 0) {
5ef3ed97 1685 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
6dc66688
ZJS
1686 1,
1687 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1688 if (r < 0)
8a50cf69 1689 continue;
469830d1
LP
1690 }
1691
1692 r = seccomp_load(seccomp);
7bc5e0b1 1693 if (ERRNO_IS_SECCOMP_FATAL(r))
469830d1 1694 return r;
add00535 1695 if (r < 0)
b069c2a3
ZJS
1696 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1697 seccomp_arch_to_string(arch));
903659e7 1698 loaded++;
469830d1 1699 }
add00535 1700
903659e7 1701 if (loaded == 0)
b069c2a3 1702 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
903659e7
CE
1703
1704 return loaded;
469830d1
LP
1705}
1706
1707int seccomp_restrict_archs(Set *archs) {
1708 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1709 Iterator i;
1710 void *id;
1711 int r;
1712
1713 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1714 * list.
1715 *
1716 * There are some qualifications. However the most important use is to stop processes from bypassing
1717 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1718 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1719
2428aaf8
AJ
1720 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1721 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1722 * to run a program with the restrictions applied. */
469830d1
LP
1723 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1724 if (!seccomp)
1725 return -ENOMEM;
1726
1727 SET_FOREACH(id, archs, i) {
1728 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1729 if (r < 0 && r != -EEXIST)
1730 return r;
1731 }
1732
1733 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1734 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1735 * The important thing is that you can block the old 32-bit x86 syscalls.
1736 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1737
1738 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1739 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1740
1741 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1742 if (r < 0 && r != -EEXIST)
469830d1 1743 return r;
add00535
LP
1744 }
1745
469830d1
LP
1746 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1747 if (r < 0)
1748 return r;
add00535 1749
1c6af69b 1750 r = seccomp_load(seccomp);
7bc5e0b1 1751 if (ERRNO_IS_SECCOMP_FATAL(r))
1c6af69b
LP
1752 return r;
1753 if (r < 0)
1754 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1755
1756 return 0;
a3be2849 1757}
b16bd535 1758
de7fef4b
ZJS
1759int parse_syscall_archs(char **l, Set **ret_archs) {
1760 _cleanup_set_free_ Set *archs = NULL;
b16bd535
YW
1761 char **s;
1762 int r;
1763
1764 assert(l);
de7fef4b 1765 assert(ret_archs);
b16bd535
YW
1766
1767 STRV_FOREACH(s, l) {
1768 uint32_t a;
1769
1770 r = seccomp_arch_from_string(*s, &a);
1771 if (r < 0)
1772 return -EINVAL;
1773
de7fef4b 1774 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
b16bd535
YW
1775 if (r < 0)
1776 return -ENOMEM;
1777 }
1778
de7fef4b 1779 *ret_archs = TAKE_PTR(archs);
b16bd535
YW
1780 return 0;
1781}
165a31c0 1782
8cfa775f 1783int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1784 const char *i;
1785 int r;
1786
1787 assert(set);
1788
1789 NULSTR_FOREACH(i, set->value) {
1790
1791 if (i[0] == '@') {
1792 const SyscallFilterSet *more;
1793
1794 more = syscall_filter_set_find(i);
1795 if (!more)
1796 return -ENXIO;
1797
165a31c0
LP
1798 r = seccomp_filter_set_add(filter, add, more);
1799 if (r < 0)
1800 return r;
1801 } else {
1802 int id;
1803
1804 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1805 if (id == __NR_SCMP_ERROR) {
1806 log_debug("Couldn't resolve system call, ignoring: %s", i);
1807 continue;
1808 }
165a31c0
LP
1809
1810 if (add) {
8cfa775f 1811 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1812 if (r < 0)
1813 return r;
1814 } else
8cfa775f 1815 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1816 }
1817 }
1818
1819 return 0;
1820}
78e864e5
TM
1821
1822int seccomp_lock_personality(unsigned long personality) {
72eafe71 1823 uint32_t arch;
78e864e5
TM
1824 int r;
1825
72eafe71
LP
1826 if (personality >= PERSONALITY_INVALID)
1827 return -EINVAL;
78e864e5 1828
72eafe71
LP
1829 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1830 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1831
72eafe71
LP
1832 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1833 if (r < 0)
1834 return r;
1835
1836 r = seccomp_rule_add_exact(
1837 seccomp,
1838 SCMP_ACT_ERRNO(EPERM),
1839 SCMP_SYS(personality),
1840 1,
1841 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1842 if (r < 0) {
1843 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1844 continue;
1845 }
72eafe71
LP
1846
1847 r = seccomp_load(seccomp);
7bc5e0b1 1848 if (ERRNO_IS_SECCOMP_FATAL(r))
72eafe71
LP
1849 return r;
1850 if (r < 0)
1851 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1852 }
1853
1854 return 0;
78e864e5 1855}
aecd5ac6
TM
1856
1857int seccomp_protect_hostname(void) {
1858 uint32_t arch;
1859 int r;
1860
1861 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1862 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1863
1864 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1865 if (r < 0)
1866 return r;
1867
1868 r = seccomp_rule_add_exact(
1869 seccomp,
1870 SCMP_ACT_ERRNO(EPERM),
1871 SCMP_SYS(sethostname),
1872 0);
9e6e543c
LP
1873 if (r < 0) {
1874 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1875 continue;
9e6e543c 1876 }
aecd5ac6
TM
1877
1878 r = seccomp_rule_add_exact(
1879 seccomp,
1880 SCMP_ACT_ERRNO(EPERM),
1881 SCMP_SYS(setdomainname),
1882 0);
9e6e543c
LP
1883 if (r < 0) {
1884 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
aecd5ac6 1885 continue;
9e6e543c 1886 }
aecd5ac6
TM
1887
1888 r = seccomp_load(seccomp);
7bc5e0b1 1889 if (ERRNO_IS_SECCOMP_FATAL(r))
aecd5ac6
TM
1890 return r;
1891 if (r < 0)
1892 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1893 }
1894
1895 return 0;
1896}
3c27973b 1897
da4dc9a6
ZJS
1898static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1899 /* Checks the mode_t parameter of the following system calls:
1900 *
1901 * → chmod() + fchmod() + fchmodat()
1902 * → open() + creat() + openat()
1903 * → mkdir() + mkdirat()
1904 * → mknod() + mknodat()
1905 *
1906 * Returns error if *everything* failed, and 0 otherwise.
1907 */
1908 int r = 0;
1909 bool any = false;
1910
1911 r = seccomp_rule_add_exact(
1912 seccomp,
1913 SCMP_ACT_ERRNO(EPERM),
1914 SCMP_SYS(chmod),
1915 1,
1916 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1917 if (r < 0)
1918 log_debug_errno(r, "Failed to add filter for chmod: %m");
1919 else
1920 any = true;
1921
1922 r = seccomp_rule_add_exact(
1923 seccomp,
1924 SCMP_ACT_ERRNO(EPERM),
1925 SCMP_SYS(fchmod),
1926 1,
1927 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1928 if (r < 0)
1929 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1930 else
1931 any = true;
1932
1933 r = seccomp_rule_add_exact(
1934 seccomp,
1935 SCMP_ACT_ERRNO(EPERM),
1936 SCMP_SYS(fchmodat),
1937 1,
1938 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1939 if (r < 0)
1940 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1941 else
1942 any = true;
1943
1944 r = seccomp_rule_add_exact(
1945 seccomp,
1946 SCMP_ACT_ERRNO(EPERM),
1947 SCMP_SYS(mkdir),
1948 1,
1949 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1950 if (r < 0)
1951 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1952 else
1953 any = true;
1954
1955 r = seccomp_rule_add_exact(
1956 seccomp,
1957 SCMP_ACT_ERRNO(EPERM),
1958 SCMP_SYS(mkdirat),
1959 1,
1960 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1961 if (r < 0)
1962 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1963 else
1964 any = true;
1965
1966 r = seccomp_rule_add_exact(
1967 seccomp,
1968 SCMP_ACT_ERRNO(EPERM),
1969 SCMP_SYS(mknod),
1970 1,
1971 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1972 if (r < 0)
1973 log_debug_errno(r, "Failed to add filter for mknod: %m");
1974 else
1975 any = true;
1976
1977 r = seccomp_rule_add_exact(
1978 seccomp,
1979 SCMP_ACT_ERRNO(EPERM),
1980 SCMP_SYS(mknodat),
1981 1,
1982 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1983 if (r < 0)
1984 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1985 else
1986 any = true;
1987
1988#if SCMP_SYS(open) > 0
1989 r = seccomp_rule_add_exact(
1990 seccomp,
1991 SCMP_ACT_ERRNO(EPERM),
1992 SCMP_SYS(open),
1993 2,
1994 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1995 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1996 if (r < 0)
1997 log_debug_errno(r, "Failed to add filter for open: %m");
1998 else
1999 any = true;
2000#endif
2001
2002 r = seccomp_rule_add_exact(
2003 seccomp,
2004 SCMP_ACT_ERRNO(EPERM),
2005 SCMP_SYS(openat),
2006 2,
2007 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2008 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2009 if (r < 0)
2010 log_debug_errno(r, "Failed to add filter for openat: %m");
2011 else
2012 any = true;
2013
ecc04067
LP
2014#if defined(__SNR_openat2)
2015 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2016 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2017 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2018 * compatible with kernels that are not absolutely recent. */
2019 r = seccomp_rule_add_exact(
2020 seccomp,
2021 SCMP_ACT_ERRNO(EPERM),
2022 SCMP_SYS(openat2),
2023 0);
2024 if (r < 0)
2025 log_debug_errno(r, "Failed to add filter for openat2: %m");
2026 else
2027 any = true;
2028#endif
2029
da4dc9a6
ZJS
2030 r = seccomp_rule_add_exact(
2031 seccomp,
2032 SCMP_ACT_ERRNO(EPERM),
2033 SCMP_SYS(creat),
2034 1,
2035 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2036 if (r < 0)
2037 log_debug_errno(r, "Failed to add filter for creat: %m");
2038 else
2039 any = true;
2040
2041 return any ? 0 : r;
2042}
2043
3c27973b
LP
2044int seccomp_restrict_suid_sgid(void) {
2045 uint32_t arch;
da4dc9a6 2046 int r, k;
3c27973b
LP
2047
2048 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2049 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2050
2051 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2052 if (r < 0)
2053 return r;
2054
da4dc9a6
ZJS
2055 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2056 if (r < 0)
2057 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2058
da4dc9a6
ZJS
2059 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2060 if (k < 0)
2061 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
3c27973b 2062
da4dc9a6 2063 if (r < 0 && k < 0)
3c27973b 2064 continue;
3c27973b
LP
2065
2066 r = seccomp_load(seccomp);
7bc5e0b1 2067 if (ERRNO_IS_SECCOMP_FATAL(r))
3c27973b
LP
2068 return r;
2069 if (r < 0)
2070 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2071 }
2072
2073 return 0;
2074}
915fb324
LP
2075
2076uint32_t scmp_act_kill_process(void) {
2077
2078 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2079 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2080 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2081 * for single-threaded apps does the right thing. */
2082
2083#ifdef SCMP_ACT_KILL_PROCESS
2084 if (seccomp_api_get() >= 3)
2085 return SCMP_ACT_KILL_PROCESS;
2086#endif
2087
2088 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2089}