]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/seccomp-util.c
seccomp: add rseq() to default list of syscalls to whitelist
[thirdparty/systemd.git] / src / shared / seccomp-util.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
57183d11 2
a8fbdf54 3#include <errno.h>
469830d1 4#include <linux/seccomp.h>
57183d11 5#include <seccomp.h>
a8fbdf54 6#include <stddef.h>
469830d1 7#include <sys/mman.h>
d347d902 8#include <sys/prctl.h>
469830d1 9#include <sys/shm.h>
57183d11 10
469830d1 11#include "af-list.h"
add00535 12#include "alloc-util.h"
d8b4d14d 13#include "errno-list.h"
a8fbdf54 14#include "macro.h"
add00535 15#include "nsflags.h"
d8b4d14d 16#include "nulstr-util.h"
78e864e5 17#include "process-util.h"
cf0fbc49 18#include "seccomp-util.h"
b16bd535 19#include "set.h"
07630cea 20#include "string-util.h"
b16bd535 21#include "strv.h"
469830d1
LP
22
23const uint32_t seccomp_local_archs[] = {
24
f2d9751c
LP
25 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
26
27#if defined(__x86_64__) && defined(__ILP32__)
469830d1
LP
28 SCMP_ARCH_X86,
29 SCMP_ARCH_X86_64,
f2d9751c
LP
30 SCMP_ARCH_X32, /* native */
31#elif defined(__x86_64__) && !defined(__ILP32__)
32 SCMP_ARCH_X86,
469830d1 33 SCMP_ARCH_X32,
f2d9751c
LP
34 SCMP_ARCH_X86_64, /* native */
35#elif defined(__i386__)
36 SCMP_ARCH_X86,
37#elif defined(__aarch64__)
469830d1 38 SCMP_ARCH_ARM,
f2d9751c
LP
39 SCMP_ARCH_AARCH64, /* native */
40#elif defined(__arm__)
41 SCMP_ARCH_ARM,
42#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
43 SCMP_ARCH_MIPSEL,
44 SCMP_ARCH_MIPS, /* native */
45#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
469830d1 46 SCMP_ARCH_MIPS,
f2d9751c
LP
47 SCMP_ARCH_MIPSEL, /* native */
48#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
49 SCMP_ARCH_MIPSEL,
50 SCMP_ARCH_MIPS,
51 SCMP_ARCH_MIPSEL64N32,
469830d1 52 SCMP_ARCH_MIPS64N32,
f2d9751c
LP
53 SCMP_ARCH_MIPSEL64,
54 SCMP_ARCH_MIPS64, /* native */
55#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPS,
469830d1 57 SCMP_ARCH_MIPSEL,
f2d9751c
LP
58 SCMP_ARCH_MIPS64N32,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64,
61 SCMP_ARCH_MIPSEL64, /* native */
62#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
63 SCMP_ARCH_MIPSEL,
64 SCMP_ARCH_MIPS,
469830d1 65 SCMP_ARCH_MIPSEL64,
f2d9751c 66 SCMP_ARCH_MIPS64,
469830d1 67 SCMP_ARCH_MIPSEL64N32,
f2d9751c
LP
68 SCMP_ARCH_MIPS64N32, /* native */
69#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPS,
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS64,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64N32,
75 SCMP_ARCH_MIPSEL64N32, /* native */
76#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
469830d1 77 SCMP_ARCH_PPC,
469830d1 78 SCMP_ARCH_PPC64LE,
f2d9751c
LP
79 SCMP_ARCH_PPC64, /* native */
80#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
81 SCMP_ARCH_PPC,
82 SCMP_ARCH_PPC64,
83 SCMP_ARCH_PPC64LE, /* native */
84#elif defined(__powerpc__)
85 SCMP_ARCH_PPC,
86#elif defined(__s390x__)
87 SCMP_ARCH_S390,
88 SCMP_ARCH_S390X, /* native */
89#elif defined(__s390__)
469830d1 90 SCMP_ARCH_S390,
469830d1
LP
91#endif
92 (uint32_t) -1
93 };
57183d11
LP
94
95const char* seccomp_arch_to_string(uint32_t c) {
aa34055f
ZJS
96 /* Maintain order used in <seccomp.h>.
97 *
98 * Names used here should be the same as those used for ConditionArchitecture=,
99 * except for "subarchitectures" like x32. */
57183d11 100
aa34055f
ZJS
101 switch(c) {
102 case SCMP_ARCH_NATIVE:
57183d11 103 return "native";
aa34055f 104 case SCMP_ARCH_X86:
57183d11 105 return "x86";
aa34055f 106 case SCMP_ARCH_X86_64:
57183d11 107 return "x86-64";
aa34055f 108 case SCMP_ARCH_X32:
57183d11 109 return "x32";
aa34055f 110 case SCMP_ARCH_ARM:
57183d11 111 return "arm";
aa34055f
ZJS
112 case SCMP_ARCH_AARCH64:
113 return "arm64";
114 case SCMP_ARCH_MIPS:
115 return "mips";
116 case SCMP_ARCH_MIPS64:
117 return "mips64";
118 case SCMP_ARCH_MIPS64N32:
119 return "mips64-n32";
120 case SCMP_ARCH_MIPSEL:
121 return "mips-le";
122 case SCMP_ARCH_MIPSEL64:
123 return "mips64-le";
124 case SCMP_ARCH_MIPSEL64N32:
125 return "mips64-le-n32";
126 case SCMP_ARCH_PPC:
127 return "ppc";
128 case SCMP_ARCH_PPC64:
129 return "ppc64";
130 case SCMP_ARCH_PPC64LE:
131 return "ppc64-le";
132 case SCMP_ARCH_S390:
6abfd303 133 return "s390";
aa34055f 134 case SCMP_ARCH_S390X:
6abfd303 135 return "s390x";
aa34055f
ZJS
136 default:
137 return NULL;
138 }
57183d11
LP
139}
140
141int seccomp_arch_from_string(const char *n, uint32_t *ret) {
142 if (!n)
143 return -EINVAL;
144
145 assert(ret);
146
147 if (streq(n, "native"))
148 *ret = SCMP_ARCH_NATIVE;
149 else if (streq(n, "x86"))
150 *ret = SCMP_ARCH_X86;
151 else if (streq(n, "x86-64"))
152 *ret = SCMP_ARCH_X86_64;
153 else if (streq(n, "x32"))
154 *ret = SCMP_ARCH_X32;
155 else if (streq(n, "arm"))
156 *ret = SCMP_ARCH_ARM;
aa34055f
ZJS
157 else if (streq(n, "arm64"))
158 *ret = SCMP_ARCH_AARCH64;
159 else if (streq(n, "mips"))
160 *ret = SCMP_ARCH_MIPS;
161 else if (streq(n, "mips64"))
162 *ret = SCMP_ARCH_MIPS64;
163 else if (streq(n, "mips64-n32"))
164 *ret = SCMP_ARCH_MIPS64N32;
165 else if (streq(n, "mips-le"))
166 *ret = SCMP_ARCH_MIPSEL;
167 else if (streq(n, "mips64-le"))
168 *ret = SCMP_ARCH_MIPSEL64;
169 else if (streq(n, "mips64-le-n32"))
170 *ret = SCMP_ARCH_MIPSEL64N32;
171 else if (streq(n, "ppc"))
172 *ret = SCMP_ARCH_PPC;
173 else if (streq(n, "ppc64"))
174 *ret = SCMP_ARCH_PPC64;
175 else if (streq(n, "ppc64-le"))
176 *ret = SCMP_ARCH_PPC64LE;
6abfd303
HB
177 else if (streq(n, "s390"))
178 *ret = SCMP_ARCH_S390;
179 else if (streq(n, "s390x"))
180 *ret = SCMP_ARCH_S390X;
57183d11
LP
181 else
182 return -EINVAL;
183
184 return 0;
185}
e9642be2 186
469830d1 187int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
8d7b0c8f
LP
188 scmp_filter_ctx seccomp;
189 int r;
190
469830d1
LP
191 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
192 * any others. Also, turns off the NNP fiddling. */
8d7b0c8f
LP
193
194 seccomp = seccomp_init(default_action);
195 if (!seccomp)
196 return -ENOMEM;
197
469830d1
LP
198 if (arch != SCMP_ARCH_NATIVE &&
199 arch != seccomp_arch_native()) {
200
1b52793d 201 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
469830d1
LP
202 if (r < 0)
203 goto finish;
204
1b52793d 205 r = seccomp_arch_add(seccomp, arch);
469830d1
LP
206 if (r < 0)
207 goto finish;
208
209 assert(seccomp_arch_exist(seccomp, arch) >= 0);
210 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
211 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
212 } else {
213 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
214 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
215 }
216
217 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
8d7b0c8f
LP
218 if (r < 0)
219 goto finish;
220
221 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
222 if (r < 0)
223 goto finish;
224
225 *ret = seccomp;
226 return 0;
227
228finish:
229 seccomp_release(seccomp);
230 return r;
231}
232
/* Probes basic seccomp support by issuing prctl(PR_GET_SECCOMP); any non-negative result counts as
 * "available". */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
236
/* Probes seccomp filter mode by asking for SECCOMP_MODE_FILTER with a NULL filter pointer. The call
 * is expected to fail; only a failure with EFAULT is treated as "filter mode is available".
 * NOTE(review): presumably EFAULT shows the kernel got as far as reading the filter pointer; confirm
 * against prctl(2). */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
241
/* Returns whether both basic seccomp and seccomp filter mode are available. The two probes run only
 * on the first call; the outcome is kept in a function-local static and reused afterwards. */
bool is_seccomp_available(void) {
        static int cache = -1; /* -1: not probed yet, otherwise the boolean result */

        if (cache >= 0)
                return cache;

        cache = is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cache;
}
252
8130926d 253const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
40eb6a80 254 [SYSCALL_FILTER_SET_DEFAULT] = {
40eb6a80 255 .name = "@default",
d5efc18b 256 .help = "System calls that are always permitted",
40eb6a80
ZJS
257 .value =
258 "clock_getres\0"
259 "clock_gettime\0"
260 "clock_nanosleep\0"
261 "execve\0"
262 "exit\0"
263 "exit_group\0"
e41b0f42
LP
264 "futex\0"
265 "get_robust_list\0"
266 "get_thread_area\0"
09d3020b
DH
267 "getegid\0"
268 "getegid32\0"
269 "geteuid\0"
270 "geteuid32\0"
271 "getgid\0"
272 "getgid32\0"
273 "getgroups\0"
274 "getgroups32\0"
275 "getpgid\0"
276 "getpgrp\0"
277 "getpid\0"
278 "getppid\0"
279 "getresgid\0"
280 "getresgid32\0"
281 "getresuid\0"
282 "getresuid32\0"
40eb6a80 283 "getrlimit\0" /* make sure processes can query stack size and such */
09d3020b
DH
284 "getsid\0"
285 "gettid\0"
40eb6a80 286 "gettimeofday\0"
09d3020b
DH
287 "getuid\0"
288 "getuid32\0"
e41b0f42 289 "membarrier\0"
40eb6a80
ZJS
290 "nanosleep\0"
291 "pause\0"
4c3a9176 292 "prlimit64\0"
e41b0f42 293 "restart_syscall\0"
6fee3be0 294 "rseq\0"
40eb6a80 295 "rt_sigreturn\0"
8f44de08 296 "sched_yield\0"
e41b0f42
LP
297 "set_robust_list\0"
298 "set_thread_area\0"
299 "set_tid_address\0"
ce5faeac 300 "set_tls\0"
40eb6a80
ZJS
301 "sigreturn\0"
302 "time\0"
4c3a9176 303 "ugetrlimit\0"
40eb6a80 304 },
44898c53
LP
305 [SYSCALL_FILTER_SET_AIO] = {
306 .name = "@aio",
307 .help = "Asynchronous IO",
308 .value =
309 "io_cancel\0"
310 "io_destroy\0"
311 "io_getevents\0"
a05cfe23 312 "io_pgetevents\0"
44898c53
LP
313 "io_setup\0"
314 "io_submit\0"
315 },
133ddbbe 316 [SYSCALL_FILTER_SET_BASIC_IO] = {
133ddbbe 317 .name = "@basic-io",
d5efc18b 318 .help = "Basic IO",
133ddbbe 319 .value =
648a0ed0 320 "_llseek\0"
133ddbbe 321 "close\0"
648a0ed0 322 "dup\0"
133ddbbe
LP
323 "dup2\0"
324 "dup3\0"
133ddbbe
LP
325 "lseek\0"
326 "pread64\0"
327 "preadv\0"
44898c53 328 "preadv2\0"
133ddbbe
LP
329 "pwrite64\0"
330 "pwritev\0"
44898c53 331 "pwritev2\0"
133ddbbe
LP
332 "read\0"
333 "readv\0"
334 "write\0"
335 "writev\0"
336 },
44898c53
LP
337 [SYSCALL_FILTER_SET_CHOWN] = {
338 .name = "@chown",
339 .help = "Change ownership of files and directories",
340 .value =
341 "chown\0"
342 "chown32\0"
343 "fchown\0"
344 "fchown32\0"
345 "fchownat\0"
346 "lchown\0"
347 "lchown32\0"
348 },
8130926d 349 [SYSCALL_FILTER_SET_CLOCK] = {
8130926d 350 .name = "@clock",
d5efc18b 351 .help = "Change the system time",
201c1cc2
TM
352 .value =
353 "adjtimex\0"
1f9ac68b
LP
354 "clock_adjtime\0"
355 "clock_settime\0"
201c1cc2 356 "settimeofday\0"
1f9ac68b 357 "stime\0"
8130926d
LP
358 },
359 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
8130926d 360 .name = "@cpu-emulation",
d5efc18b 361 .help = "System calls for CPU emulation functionality",
1f9ac68b
LP
362 .value =
363 "modify_ldt\0"
364 "subpage_prot\0"
365 "switch_endian\0"
366 "vm86\0"
367 "vm86old\0"
8130926d
LP
368 },
369 [SYSCALL_FILTER_SET_DEBUG] = {
8130926d 370 .name = "@debug",
d5efc18b 371 .help = "Debugging, performance monitoring and tracing functionality",
1f9ac68b
LP
372 .value =
373 "lookup_dcookie\0"
374 "perf_event_open\0"
1f9ac68b
LP
375 "ptrace\0"
376 "rtas\0"
8130926d 377#ifdef __NR_s390_runtime_instr
1f9ac68b 378 "s390_runtime_instr\0"
8130926d 379#endif
1f9ac68b 380 "sys_debug_setcontext\0"
8130926d 381 },
1a1b13c9
LP
382 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
383 .name = "@file-system",
384 .help = "File system operations",
385 .value =
386 "access\0"
387 "chdir\0"
388 "chmod\0"
389 "close\0"
390 "creat\0"
391 "faccessat\0"
392 "fallocate\0"
393 "fchdir\0"
394 "fchmod\0"
395 "fchmodat\0"
1a1b13c9 396 "fcntl\0"
ceaa6aa7 397 "fcntl64\0"
1a1b13c9
LP
398 "fgetxattr\0"
399 "flistxattr\0"
ceaa6aa7 400 "fremovexattr\0"
1a1b13c9 401 "fsetxattr\0"
1a1b13c9 402 "fstat\0"
ceaa6aa7 403 "fstat64\0"
1a1b13c9 404 "fstatat64\0"
1a1b13c9 405 "fstatfs\0"
ceaa6aa7 406 "fstatfs64\0"
1a1b13c9 407 "ftruncate\0"
ceaa6aa7 408 "ftruncate64\0"
1a1b13c9
LP
409 "futimesat\0"
410 "getcwd\0"
1a1b13c9 411 "getdents\0"
ceaa6aa7 412 "getdents64\0"
1a1b13c9
LP
413 "getxattr\0"
414 "inotify_add_watch\0"
ceaa6aa7 415 "inotify_init\0"
1a1b13c9
LP
416 "inotify_init1\0"
417 "inotify_rm_watch\0"
418 "lgetxattr\0"
419 "link\0"
420 "linkat\0"
421 "listxattr\0"
422 "llistxattr\0"
423 "lremovexattr\0"
424 "lsetxattr\0"
1a1b13c9 425 "lstat\0"
ceaa6aa7 426 "lstat64\0"
1a1b13c9
LP
427 "mkdir\0"
428 "mkdirat\0"
429 "mknod\0"
430 "mknodat\0"
1a1b13c9 431 "mmap\0"
ceaa6aa7 432 "mmap2\0"
7961116e 433 "munmap\0"
1a1b13c9 434 "newfstatat\0"
ceaa6aa7
LP
435 "oldfstat\0"
436 "oldlstat\0"
437 "oldstat\0"
1a1b13c9
LP
438 "open\0"
439 "openat\0"
440 "readlink\0"
441 "readlinkat\0"
442 "removexattr\0"
443 "rename\0"
1a1b13c9 444 "renameat\0"
ceaa6aa7 445 "renameat2\0"
1a1b13c9
LP
446 "rmdir\0"
447 "setxattr\0"
1a1b13c9 448 "stat\0"
ceaa6aa7 449 "stat64\0"
1a1b13c9 450 "statfs\0"
ceaa6aa7 451 "statfs64\0"
8e6a7a8b 452#ifdef __NR_statx
a4135a74 453 "statx\0"
ceaa6aa7 454#endif
1a1b13c9
LP
455 "symlink\0"
456 "symlinkat\0"
1a1b13c9 457 "truncate\0"
ceaa6aa7 458 "truncate64\0"
1a1b13c9
LP
459 "unlink\0"
460 "unlinkat\0"
ceaa6aa7 461 "utime\0"
1a1b13c9
LP
462 "utimensat\0"
463 "utimes\0"
464 },
8130926d 465 [SYSCALL_FILTER_SET_IO_EVENT] = {
8130926d 466 .name = "@io-event",
d5efc18b 467 .help = "Event loop system calls",
201c1cc2
TM
468 .value =
469 "_newselect\0"
201c1cc2 470 "epoll_create\0"
215728ff 471 "epoll_create1\0"
201c1cc2
TM
472 "epoll_ctl\0"
473 "epoll_ctl_old\0"
474 "epoll_pwait\0"
475 "epoll_wait\0"
476 "epoll_wait_old\0"
201c1cc2 477 "eventfd\0"
215728ff 478 "eventfd2\0"
201c1cc2
TM
479 "poll\0"
480 "ppoll\0"
481 "pselect6\0"
482 "select\0"
8130926d
LP
483 },
484 [SYSCALL_FILTER_SET_IPC] = {
8130926d 485 .name = "@ipc",
d5efc18b
ZJS
486 .help = "SysV IPC, POSIX Message Queues or other IPC",
487 .value =
488 "ipc\0"
cd5bfd7e 489 "memfd_create\0"
201c1cc2
TM
490 "mq_getsetattr\0"
491 "mq_notify\0"
492 "mq_open\0"
493 "mq_timedreceive\0"
494 "mq_timedsend\0"
495 "mq_unlink\0"
496 "msgctl\0"
497 "msgget\0"
498 "msgrcv\0"
499 "msgsnd\0"
cd5bfd7e 500 "pipe\0"
215728ff 501 "pipe2\0"
201c1cc2
TM
502 "process_vm_readv\0"
503 "process_vm_writev\0"
504 "semctl\0"
505 "semget\0"
506 "semop\0"
507 "semtimedop\0"
508 "shmat\0"
509 "shmctl\0"
510 "shmdt\0"
511 "shmget\0"
8130926d
LP
512 },
513 [SYSCALL_FILTER_SET_KEYRING] = {
8130926d 514 .name = "@keyring",
d5efc18b 515 .help = "Kernel keyring access",
1f9ac68b
LP
516 .value =
517 "add_key\0"
518 "keyctl\0"
519 "request_key\0"
8130926d 520 },
cd0ddf6f
LP
521 [SYSCALL_FILTER_SET_MEMLOCK] = {
522 .name = "@memlock",
523 .help = "Memory locking control",
524 .value =
525 "mlock\0"
526 "mlock2\0"
527 "mlockall\0"
528 "munlock\0"
529 "munlockall\0"
530 },
8130926d 531 [SYSCALL_FILTER_SET_MODULE] = {
8130926d 532 .name = "@module",
d5efc18b 533 .help = "Loading and unloading of kernel modules",
201c1cc2 534 .value =
201c1cc2
TM
535 "delete_module\0"
536 "finit_module\0"
537 "init_module\0"
8130926d
LP
538 },
539 [SYSCALL_FILTER_SET_MOUNT] = {
8130926d 540 .name = "@mount",
d5efc18b 541 .help = "Mounting and unmounting of file systems",
201c1cc2
TM
542 .value =
543 "chroot\0"
544 "mount\0"
201c1cc2 545 "pivot_root\0"
201c1cc2 546 "umount\0"
215728ff 547 "umount2\0"
8130926d
LP
548 },
549 [SYSCALL_FILTER_SET_NETWORK_IO] = {
8130926d 550 .name = "@network-io",
d5efc18b 551 .help = "Network or Unix socket IO, should not be needed if not network facing",
201c1cc2 552 .value =
201c1cc2 553 "accept\0"
215728ff 554 "accept4\0"
201c1cc2
TM
555 "bind\0"
556 "connect\0"
557 "getpeername\0"
558 "getsockname\0"
559 "getsockopt\0"
560 "listen\0"
561 "recv\0"
562 "recvfrom\0"
563 "recvmmsg\0"
564 "recvmsg\0"
565 "send\0"
566 "sendmmsg\0"
567 "sendmsg\0"
568 "sendto\0"
569 "setsockopt\0"
570 "shutdown\0"
571 "socket\0"
572 "socketcall\0"
573 "socketpair\0"
8130926d
LP
574 },
575 [SYSCALL_FILTER_SET_OBSOLETE] = {
d5efc18b 576 /* some unknown even to libseccomp */
8130926d 577 .name = "@obsolete",
d5efc18b 578 .help = "Unusual, obsolete or unimplemented system calls",
201c1cc2
TM
579 .value =
580 "_sysctl\0"
581 "afs_syscall\0"
802fa07a 582 "bdflush\0"
201c1cc2 583 "break\0"
1f9ac68b 584 "create_module\0"
201c1cc2
TM
585 "ftime\0"
586 "get_kernel_syms\0"
201c1cc2
TM
587 "getpmsg\0"
588 "gtty\0"
7e0c3b8f 589 "idle\0"
201c1cc2 590 "lock\0"
201c1cc2 591 "mpx\0"
201c1cc2
TM
592 "prof\0"
593 "profil\0"
201c1cc2
TM
594 "putpmsg\0"
595 "query_module\0"
201c1cc2
TM
596 "security\0"
597 "sgetmask\0"
598 "ssetmask\0"
599 "stty\0"
1f9ac68b 600 "sysfs\0"
201c1cc2
TM
601 "tuxcall\0"
602 "ulimit\0"
603 "uselib\0"
1f9ac68b 604 "ustat\0"
201c1cc2 605 "vserver\0"
8130926d
LP
606 },
607 [SYSCALL_FILTER_SET_PRIVILEGED] = {
8130926d 608 .name = "@privileged",
d5efc18b 609 .help = "All system calls which need super-user capabilities",
201c1cc2 610 .value =
44898c53 611 "@chown\0"
201c1cc2
TM
612 "@clock\0"
613 "@module\0"
614 "@raw-io\0"
af0f047b
LP
615 "@reboot\0"
616 "@swap\0"
215728ff 617 "_sysctl\0"
201c1cc2 618 "acct\0"
201c1cc2 619 "bpf\0"
1f9ac68b 620 "capset\0"
201c1cc2 621 "chroot\0"
a05cfe23 622 "fanotify_init\0"
201c1cc2 623 "nfsservctl\0"
a05cfe23 624 "open_by_handle_at\0"
201c1cc2
TM
625 "pivot_root\0"
626 "quotactl\0"
201c1cc2 627 "setdomainname\0"
201c1cc2 628 "setfsuid\0"
215728ff 629 "setfsuid32\0"
201c1cc2 630 "setgroups\0"
215728ff 631 "setgroups32\0"
201c1cc2 632 "sethostname\0"
201c1cc2 633 "setresuid\0"
215728ff 634 "setresuid32\0"
201c1cc2 635 "setreuid\0"
215728ff 636 "setreuid32\0"
e05ee49b 637 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
215728ff 638 "setuid32\0"
201c1cc2 639 "vhangup\0"
8130926d
LP
640 },
641 [SYSCALL_FILTER_SET_PROCESS] = {
8130926d 642 .name = "@process",
d5efc18b 643 .help = "Process control, execution, namespaceing operations",
201c1cc2
TM
644 .value =
645 "arch_prctl\0"
09d3020b 646 "capget\0" /* Able to query arbitrary processes */
201c1cc2 647 "clone\0"
201c1cc2
TM
648 "execveat\0"
649 "fork\0"
b887d2eb 650 "getrusage\0"
201c1cc2
TM
651 "kill\0"
652 "prctl\0"
b887d2eb
LP
653 "rt_sigqueueinfo\0"
654 "rt_tgsigqueueinfo\0"
201c1cc2 655 "setns\0"
a9518dc3 656 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
201c1cc2 657 "tgkill\0"
b887d2eb 658 "times\0"
201c1cc2
TM
659 "tkill\0"
660 "unshare\0"
661 "vfork\0"
b887d2eb
LP
662 "wait4\0"
663 "waitid\0"
664 "waitpid\0"
8130926d
LP
665 },
666 [SYSCALL_FILTER_SET_RAW_IO] = {
8130926d 667 .name = "@raw-io",
d5efc18b 668 .help = "Raw I/O port access",
201c1cc2
TM
669 .value =
670 "ioperm\0"
671 "iopl\0"
1f9ac68b 672 "pciconfig_iobase\0"
201c1cc2
TM
673 "pciconfig_read\0"
674 "pciconfig_write\0"
8130926d 675#ifdef __NR_s390_pci_mmio_read
201c1cc2 676 "s390_pci_mmio_read\0"
8130926d
LP
677#endif
678#ifdef __NR_s390_pci_mmio_write
201c1cc2 679 "s390_pci_mmio_write\0"
8130926d
LP
680#endif
681 },
bd2ab3f4
LP
682 [SYSCALL_FILTER_SET_REBOOT] = {
683 .name = "@reboot",
684 .help = "Reboot and reboot preparation/kexec",
685 .value =
bd2ab3f4 686 "kexec_file_load\0"
e59608fa 687 "kexec_load\0"
bd2ab3f4
LP
688 "reboot\0"
689 },
133ddbbe 690 [SYSCALL_FILTER_SET_RESOURCES] = {
133ddbbe 691 .name = "@resources",
58a8f68b 692 .help = "Alter resource settings",
133ddbbe 693 .value =
0963c053
LP
694 "ioprio_set\0"
695 "mbind\0"
696 "migrate_pages\0"
697 "move_pages\0"
698 "nice\0"
0963c053
LP
699 "sched_setaffinity\0"
700 "sched_setattr\0"
133ddbbe
LP
701 "sched_setparam\0"
702 "sched_setscheduler\0"
0963c053 703 "set_mempolicy\0"
133ddbbe
LP
704 "setpriority\0"
705 "setrlimit\0"
133ddbbe 706 },
6eaaeee9
LP
707 [SYSCALL_FILTER_SET_SETUID] = {
708 .name = "@setuid",
709 .help = "Operations for changing user/group credentials",
710 .value =
6eaaeee9 711 "setgid\0"
215728ff 712 "setgid32\0"
6eaaeee9 713 "setgroups\0"
215728ff 714 "setgroups32\0"
6eaaeee9 715 "setregid\0"
215728ff 716 "setregid32\0"
6eaaeee9 717 "setresgid\0"
215728ff 718 "setresgid32\0"
6eaaeee9 719 "setresuid\0"
215728ff 720 "setresuid32\0"
6eaaeee9 721 "setreuid\0"
215728ff 722 "setreuid32\0"
6eaaeee9 723 "setuid\0"
215728ff 724 "setuid32\0"
6eaaeee9 725 },
cd0ddf6f
LP
726 [SYSCALL_FILTER_SET_SIGNAL] = {
727 .name = "@signal",
728 .help = "Process signal handling",
729 .value =
730 "rt_sigaction\0"
731 "rt_sigpending\0"
732 "rt_sigprocmask\0"
733 "rt_sigsuspend\0"
734 "rt_sigtimedwait\0"
735 "sigaction\0"
736 "sigaltstack\0"
737 "signal\0"
738 "signalfd\0"
739 "signalfd4\0"
740 "sigpending\0"
741 "sigprocmask\0"
742 "sigsuspend\0"
743 },
bd2ab3f4
LP
744 [SYSCALL_FILTER_SET_SWAP] = {
745 .name = "@swap",
746 .help = "Enable/disable swap devices",
747 .value =
748 "swapoff\0"
749 "swapon\0"
750 },
44898c53
LP
751 [SYSCALL_FILTER_SET_SYNC] = {
752 .name = "@sync",
753 .help = "Synchronize files and memory to storage",
754 .value =
755 "fdatasync\0"
756 "fsync\0"
757 "msync\0"
758 "sync\0"
759 "sync_file_range\0"
760 "syncfs\0"
761 },
70526841
LP
762 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
763 .name = "@system-service",
764 .help = "General system service operations",
765 .value =
766 "@aio\0"
767 "@basic-io\0"
768 "@chown\0"
769 "@default\0"
770 "@file-system\0"
771 "@io-event\0"
772 "@ipc\0"
773 "@keyring\0"
774 "@memlock\0"
775 "@network-io\0"
776 "@process\0"
777 "@resources\0"
778 "@setuid\0"
779 "@signal\0"
780 "@sync\0"
781 "@timer\0"
782 "brk\0"
783 "capget\0"
784 "capset\0"
785 "copy_file_range\0"
786 "fadvise64\0"
787 "fadvise64_64\0"
788 "flock\0"
789 "get_mempolicy\0"
790 "getcpu\0"
791 "getpriority\0"
792 "getrandom\0"
793 "ioctl\0"
794 "ioprio_get\0"
795 "kcmp\0"
796 "madvise\0"
70526841
LP
797 "mprotect\0"
798 "mremap\0"
799 "name_to_handle_at\0"
800 "oldolduname\0"
801 "olduname\0"
802 "personality\0"
803 "readahead\0"
804 "readdir\0"
805 "remap_file_pages\0"
806 "sched_get_priority_max\0"
807 "sched_get_priority_min\0"
808 "sched_getaffinity\0"
809 "sched_getattr\0"
810 "sched_getparam\0"
811 "sched_getscheduler\0"
812 "sched_rr_get_interval\0"
813 "sched_yield\0"
814 "sendfile\0"
815 "sendfile64\0"
816 "setfsgid\0"
817 "setfsgid32\0"
818 "setfsuid\0"
819 "setfsuid32\0"
820 "setpgid\0"
821 "setsid\0"
822 "splice\0"
823 "sysinfo\0"
824 "tee\0"
825 "umask\0"
826 "uname\0"
827 "userfaultfd\0"
828 "vmsplice\0"
829 },
cd0ddf6f
LP
830 [SYSCALL_FILTER_SET_TIMER] = {
831 .name = "@timer",
832 .help = "Schedule operations by time",
833 .value =
834 "alarm\0"
835 "getitimer\0"
836 "setitimer\0"
837 "timer_create\0"
838 "timer_delete\0"
839 "timer_getoverrun\0"
840 "timer_gettime\0"
841 "timer_settime\0"
842 "timerfd_create\0"
843 "timerfd_gettime\0"
844 "timerfd_settime\0"
845 "times\0"
846 },
201c1cc2 847};
8130926d
LP
848
849const SyscallFilterSet *syscall_filter_set_find(const char *name) {
850 unsigned i;
851
852 if (isempty(name) || name[0] != '@')
853 return NULL;
854
855 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
856 if (streq(syscall_filter_sets[i].name, name))
857 return syscall_filter_sets + i;
858
859 return NULL;
860}
861
b54f36c6 862static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
69b1b241 863
b54f36c6 864int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
69b1b241
LP
865 assert(seccomp);
866 assert(name);
867
960e4569
LP
868 if (strv_contains(exclude, name))
869 return 0;
870
69b1b241
LP
871 if (name[0] == '@') {
872 const SyscallFilterSet *other;
873
874 other = syscall_filter_set_find(name);
baaa35ad
ZJS
875 if (!other)
876 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
877 "Filter set %s is not known!",
878 name);
69b1b241 879
b54f36c6
ZJS
880 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
881
69b1b241 882 } else {
b54f36c6 883 int id, r;
69b1b241
LP
884
885 id = seccomp_syscall_resolve_name(name);
cff7bff8 886 if (id == __NR_SCMP_ERROR) {
b54f36c6
ZJS
887 if (log_missing)
888 log_debug("System call %s is not known, ignoring.", name);
ff217dc3 889 return 0;
cff7bff8 890 }
69b1b241
LP
891
892 r = seccomp_rule_add_exact(seccomp, action, id, 0);
b54f36c6 893 if (r < 0) {
69b1b241 894 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
7e86bd73
ZJS
895 bool ignore = r == -EDOM;
896
897 if (!ignore || log_missing)
898 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
899 name, id, ignore ? ", ignoring" : "");
900 if (!ignore)
901 return r;
b54f36c6 902 }
69b1b241 903
b54f36c6
ZJS
904 return 0;
905 }
69b1b241
LP
906}
907
469830d1
LP
908static int seccomp_add_syscall_filter_set(
909 scmp_filter_ctx seccomp,
469830d1 910 const SyscallFilterSet *set,
960e4569 911 uint32_t action,
b54f36c6
ZJS
912 char **exclude,
913 bool log_missing) {
469830d1 914
8130926d
LP
915 const char *sys;
916 int r;
917
918 assert(seccomp);
919 assert(set);
920
921 NULSTR_FOREACH(sys, set->value) {
b54f36c6 922 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
69b1b241
LP
923 if (r < 0)
924 return r;
469830d1
LP
925 }
926
927 return 0;
928}
929
b54f36c6 930int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
469830d1
LP
931 uint32_t arch;
932 int r;
933
934 assert(set);
935
936 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
a90db619 937 * each local arch. */
469830d1
LP
938
939 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
940 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
941
942 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
943
944 r = seccomp_init_for_arch(&seccomp, arch, default_action);
8130926d
LP
945 if (r < 0)
946 return r;
469830d1 947
b54f36c6 948 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
7e86bd73
ZJS
949 if (r < 0)
950 return log_debug_errno(r, "Failed to add filter set: %m");
469830d1
LP
951
952 r = seccomp_load(seccomp);
953 if (IN_SET(r, -EPERM, -EACCES))
954 return r;
955 if (r < 0)
956 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
8130926d
LP
957 }
958
959 return 0;
960}
a3be2849 961
b54f36c6 962int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
469830d1 963 uint32_t arch;
a3be2849
LP
964 int r;
965
469830d1
LP
966 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
967 * SyscallFilterSet* table. */
a3be2849 968
8cfa775f 969 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
469830d1 970 return 0;
a3be2849 971
469830d1
LP
972 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
973 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
974 Iterator i;
b54f36c6 975 void *syscall_id, *val;
a3be2849 976
469830d1 977 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
a3be2849 978
469830d1
LP
979 r = seccomp_init_for_arch(&seccomp, arch, default_action);
980 if (r < 0)
981 return r;
a3be2849 982
b54f36c6 983 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
8cfa775f 984 uint32_t a = action;
b54f36c6
ZJS
985 int id = PTR_TO_INT(syscall_id) - 1;
986 int error = PTR_TO_INT(val);
8cfa775f 987
b54f36c6
ZJS
988 if (action != SCMP_ACT_ALLOW && error >= 0)
989 a = SCMP_ACT_ERRNO(error);
8cfa775f 990
b54f36c6 991 r = seccomp_rule_add_exact(seccomp, a, id, 0);
469830d1
LP
992 if (r < 0) {
993 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
994 _cleanup_free_ char *n = NULL;
7e86bd73 995 bool ignore;
469830d1 996
b54f36c6 997 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
7e86bd73
ZJS
998 ignore = r == -EDOM;
999 if (!ignore || log_missing)
1000 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1001 strna(n), id, ignore ? ", ignoring" : "");
1002 if (!ignore)
1003 return r;
469830d1
LP
1004 }
1005 }
1006
1007 r = seccomp_load(seccomp);
1008 if (IN_SET(r, -EPERM, -EACCES))
1009 return r;
1010 if (r < 0)
1011 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1012 }
1013
1014 return 0;
add00535
LP
1015}
1016
13d92c63 1017int seccomp_parse_syscall_filter_full(
898748d8
YW
1018 const char *name,
1019 int errno_num,
1020 Hashmap *filter,
13d92c63 1021 SeccompParseFlags flags,
898748d8
YW
1022 const char *unit,
1023 const char *filename,
1024 unsigned line) {
1025
1026 int r;
1027
1028 assert(name);
1029 assert(filter);
1030
1031 if (name[0] == '@') {
1032 const SyscallFilterSet *set;
1033 const char *i;
1034
1035 set = syscall_filter_set_find(name);
1036 if (!set) {
13d92c63 1037 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1038 return -EINVAL;
13d92c63
LP
1039
1040 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1041 "Unknown system call group, ignoring: %s", name);
1042 return 0;
898748d8
YW
1043 }
1044
1045 NULSTR_FOREACH(i, set->value) {
13d92c63
LP
1046 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1047 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1048 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1049 * about them. */
1050 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
898748d8
YW
1051 if (r < 0)
1052 return r;
1053 }
1054 } else {
1055 int id;
1056
1057 id = seccomp_syscall_resolve_name(name);
1058 if (id == __NR_SCMP_ERROR) {
13d92c63 1059 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
898748d8 1060 return -EINVAL;
13d92c63
LP
1061
1062 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1063 "Failed to parse system call, ignoring: %s", name);
1064 return 0;
898748d8
YW
1065 }
1066
1067 /* If we previously wanted to forbid a syscall and now
1068 * we want to allow it, then remove it from the list. */
13d92c63 1069 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
898748d8
YW
1070 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1071 if (r < 0)
851ee70a
LW
1072 switch (r) {
1073 case -ENOMEM:
1074 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1075 case -EEXIST:
9d7fe7c6
LW
1076 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1077 break;
851ee70a
LW
1078 default:
1079 return r;
1080 }
898748d8
YW
1081 } else
1082 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1083 }
1084
1085 return 0;
1086}
1087
add00535 1088int seccomp_restrict_namespaces(unsigned long retain) {
469830d1 1089 uint32_t arch;
add00535
LP
1090 int r;
1091
f1d34068 1092 if (DEBUG_LOGGING) {
add00535
LP
1093 _cleanup_free_ char *s = NULL;
1094
86c2a9f1 1095 (void) namespace_flags_to_string(retain, &s);
add00535
LP
1096 log_debug("Restricting namespace to: %s.", strna(s));
1097 }
1098
1099 /* NOOP? */
1100 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1101 return 0;
1102
469830d1
LP
1103 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1104 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1105 unsigned i;
add00535 1106
469830d1
LP
1107 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1108
1109 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1110 if (r < 0)
1111 return r;
1112
1113 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1114 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1115 * altogether. */
1116 r = seccomp_rule_add_exact(
1117 seccomp,
1118 SCMP_ACT_ERRNO(EPERM),
1119 SCMP_SYS(setns),
1120 0);
1121 else
1122 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1123 * special invocation with a zero flags argument, right here. */
1124 r = seccomp_rule_add_exact(
1125 seccomp,
1126 SCMP_ACT_ERRNO(EPERM),
1127 SCMP_SYS(setns),
1128 1,
1129 SCMP_A1(SCMP_CMP_EQ, 0));
1130 if (r < 0) {
1131 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1132 continue;
1133 }
1134
1135 for (i = 0; namespace_flag_map[i].name; i++) {
1136 unsigned long f;
1137
1138 f = namespace_flag_map[i].flag;
1139 if ((retain & f) == f) {
1140 log_debug("Permitting %s.", namespace_flag_map[i].name);
1141 continue;
1142 }
1143
1144 log_debug("Blocking %s.", namespace_flag_map[i].name);
1145
1146 r = seccomp_rule_add_exact(
1147 seccomp,
1148 SCMP_ACT_ERRNO(EPERM),
1149 SCMP_SYS(unshare),
1150 1,
1151 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1152 if (r < 0) {
1153 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1154 break;
1155 }
1156
511ceb1f
ZJS
1157 /* On s390/s390x the first two parameters to clone are switched */
1158 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
ae9d60ce
LP
1159 r = seccomp_rule_add_exact(
1160 seccomp,
1161 SCMP_ACT_ERRNO(EPERM),
1162 SCMP_SYS(clone),
1163 1,
1164 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1165 else
1166 r = seccomp_rule_add_exact(
1167 seccomp,
1168 SCMP_ACT_ERRNO(EPERM),
1169 SCMP_SYS(clone),
1170 1,
1171 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
469830d1
LP
1172 if (r < 0) {
1173 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1174 break;
1175 }
1176
1177 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1178 r = seccomp_rule_add_exact(
1179 seccomp,
1180 SCMP_ACT_ERRNO(EPERM),
1181 SCMP_SYS(setns),
1182 1,
1183 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1184 if (r < 0) {
1185 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1186 break;
1187 }
1188 }
1189 }
1190 if (r < 0)
1191 continue;
1192
1193 r = seccomp_load(seccomp);
1194 if (IN_SET(r, -EPERM, -EACCES))
1195 return r;
1196 if (r < 0)
1197 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1198 }
1199
1200 return 0;
1201}
1202
1203int seccomp_protect_sysctl(void) {
1204 uint32_t arch;
1205 int r;
1206
1207 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1208 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1209
1210 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1211
2e64e8f4
ZJS
1212 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1213 /* No _sysctl syscall */
1214 continue;
1215
469830d1
LP
1216 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1217 if (r < 0)
1218 return r;
1219
1220 r = seccomp_rule_add_exact(
add00535
LP
1221 seccomp,
1222 SCMP_ACT_ERRNO(EPERM),
469830d1 1223 SCMP_SYS(_sysctl),
add00535 1224 0);
469830d1
LP
1225 if (r < 0) {
1226 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1227 continue;
1228 }
1229
1230 r = seccomp_load(seccomp);
1231 if (IN_SET(r, -EPERM, -EACCES))
1232 return r;
1233 if (r < 0)
1234 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1235 }
1236
1237 return 0;
1238}
1239
1240int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1241 uint32_t arch;
1242 int r;
1243
1244 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1245 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
9606bc4b 1246 bool supported;
469830d1
LP
1247 Iterator i;
1248
1249 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1250
9606bc4b
LP
1251 switch (arch) {
1252
1253 case SCMP_ARCH_X86_64:
1254 case SCMP_ARCH_X32:
1255 case SCMP_ARCH_ARM:
1256 case SCMP_ARCH_AARCH64:
0d9fca76 1257 case SCMP_ARCH_PPC:
da1921a5
ZJS
1258 case SCMP_ARCH_PPC64:
1259 case SCMP_ARCH_PPC64LE:
f5aeac14
JC
1260 case SCMP_ARCH_MIPSEL64N32:
1261 case SCMP_ARCH_MIPS64N32:
1262 case SCMP_ARCH_MIPSEL64:
1263 case SCMP_ARCH_MIPS64:
9606bc4b
LP
1264 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1265 supported = true;
1266 break;
1267
9606bc4b
LP
1268 case SCMP_ARCH_S390:
1269 case SCMP_ARCH_S390X:
da1921a5 1270 case SCMP_ARCH_X86:
f5aeac14
JC
1271 case SCMP_ARCH_MIPSEL:
1272 case SCMP_ARCH_MIPS:
9606bc4b
LP
1273 default:
1274 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1275 * don't know */
1276 supported = false;
1277 break;
1278 }
1279
1280 if (!supported)
1281 continue;
1282
469830d1
LP
1283 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1284 if (r < 0)
1285 return r;
1286
1287 if (whitelist) {
1288 int af, first = 0, last = 0;
1289 void *afp;
1290
1291 /* If this is a whitelist, we first block the address families that are out of range and then
1292 * everything that is not in the set. First, we find the lowest and highest address family in
1293 * the set. */
1294
1295 SET_FOREACH(afp, address_families, i) {
1296 af = PTR_TO_INT(afp);
1297
1298 if (af <= 0 || af >= af_max())
1299 continue;
1300
1301 if (first == 0 || af < first)
1302 first = af;
1303
1304 if (last == 0 || af > last)
1305 last = af;
1306 }
1307
1308 assert((first == 0) == (last == 0));
1309
1310 if (first == 0) {
1311
1312 /* No entries in the valid range, block everything */
1313 r = seccomp_rule_add_exact(
1314 seccomp,
1315 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1316 SCMP_SYS(socket),
1317 0);
1318 if (r < 0) {
1319 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1320 continue;
1321 }
1322
1323 } else {
1324
1325 /* Block everything below the first entry */
1326 r = seccomp_rule_add_exact(
1327 seccomp,
1328 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1329 SCMP_SYS(socket),
1330 1,
1331 SCMP_A0(SCMP_CMP_LT, first));
1332 if (r < 0) {
1333 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1334 continue;
1335 }
1336
1337 /* Block everything above the last entry */
1338 r = seccomp_rule_add_exact(
1339 seccomp,
1340 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1341 SCMP_SYS(socket),
1342 1,
1343 SCMP_A0(SCMP_CMP_GT, last));
1344 if (r < 0) {
1345 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1346 continue;
1347 }
1348
1349 /* Block everything between the first and last entry */
1350 for (af = 1; af < af_max(); af++) {
1351
1352 if (set_contains(address_families, INT_TO_PTR(af)))
1353 continue;
1354
1355 r = seccomp_rule_add_exact(
1356 seccomp,
1357 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1358 SCMP_SYS(socket),
1359 1,
1360 SCMP_A0(SCMP_CMP_EQ, af));
1361 if (r < 0)
1362 break;
1363 }
469830d1
LP
1364 if (r < 0) {
1365 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1366 continue;
1367 }
1368 }
1369
1370 } else {
1371 void *af;
1372
1373 /* If this is a blacklist, then generate one rule for
1374 * each address family that are then combined in OR
1375 * checks. */
1376
1377 SET_FOREACH(af, address_families, i) {
1378
1379 r = seccomp_rule_add_exact(
1380 seccomp,
1381 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1382 SCMP_SYS(socket),
1383 1,
1384 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1385 if (r < 0)
1386 break;
1387 }
469830d1
LP
1388 if (r < 0) {
1389 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1390 continue;
1391 }
1392 }
1393
1394 r = seccomp_load(seccomp);
1395 if (IN_SET(r, -EPERM, -EACCES))
1396 return r;
1397 if (r < 0)
1398 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1399 }
1400
1401 return 0;
1402}
1403
1404int seccomp_restrict_realtime(void) {
1405 static const int permitted_policies[] = {
1406 SCHED_OTHER,
1407 SCHED_BATCH,
1408 SCHED_IDLE,
1409 };
1410
1411 int r, max_policy = 0;
1412 uint32_t arch;
1413 unsigned i;
1414
1415 /* Determine the highest policy constant we want to allow */
1416 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1417 if (permitted_policies[i] > max_policy)
1418 max_policy = permitted_policies[i];
1419
1420 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1421 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1422 int p;
1423
1424 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1425
1426 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1427 if (r < 0)
1428 return r;
1429
1430 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1431 * whitelist. */
1432 for (p = 0; p < max_policy; p++) {
1433 bool good = false;
1434
1435 /* Check if this is in the whitelist. */
1436 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1437 if (permitted_policies[i] == p) {
1438 good = true;
1439 break;
1440 }
1441
1442 if (good)
1443 continue;
1444
1445 /* Deny this policy */
1446 r = seccomp_rule_add_exact(
1447 seccomp,
1448 SCMP_ACT_ERRNO(EPERM),
1449 SCMP_SYS(sched_setscheduler),
1450 1,
1451 SCMP_A1(SCMP_CMP_EQ, p));
1452 if (r < 0) {
1453 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1454 continue;
1455 }
1456 }
1457
1458 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1459 * unsigned here, hence no need no check for < 0 values. */
1460 r = seccomp_rule_add_exact(
add00535
LP
1461 seccomp,
1462 SCMP_ACT_ERRNO(EPERM),
469830d1 1463 SCMP_SYS(sched_setscheduler),
add00535 1464 1,
469830d1
LP
1465 SCMP_A1(SCMP_CMP_GT, max_policy));
1466 if (r < 0) {
1467 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1468 continue;
1469 }
add00535 1470
469830d1
LP
1471 r = seccomp_load(seccomp);
1472 if (IN_SET(r, -EPERM, -EACCES))
1473 return r;
1474 if (r < 0)
1475 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1476 }
1477
1478 return 0;
1479}
1480
6dc66688
ZJS
1481static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1482 uint32_t arch,
1483 int nr,
14cb109d 1484 unsigned arg_cnt,
6dc66688
ZJS
1485 const struct scmp_arg_cmp arg) {
1486 int r;
1487
1488 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1489 if (r < 0) {
1490 _cleanup_free_ char *n = NULL;
1491
1492 n = seccomp_syscall_resolve_num_arch(arch, nr);
1493 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1494 strna(n),
1495 seccomp_arch_to_string(arch));
1496 }
1497
1498 return r;
1499}
1500
2a8d6e63 1501/* For known architectures, check that syscalls are indeed defined or not. */
303d6b4c 1502#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
2a8d6e63
ZJS
1503assert_cc(SCMP_SYS(shmget) > 0);
1504assert_cc(SCMP_SYS(shmat) > 0);
1505assert_cc(SCMP_SYS(shmdt) > 0);
2a8d6e63 1506#endif
6dc66688 1507
469830d1
LP
1508int seccomp_memory_deny_write_execute(void) {
1509 uint32_t arch;
1510 int r;
1511
1512 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1513 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
8a50cf69 1514 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
add00535 1515
469830d1
LP
1516 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1517
8a50cf69
LP
1518 switch (arch) {
1519
1520 case SCMP_ARCH_X86:
1521 filter_syscall = SCMP_SYS(mmap2);
1522 block_syscall = SCMP_SYS(mmap);
67fb5f33 1523 shmat_syscall = SCMP_SYS(shmat);
2a8d6e63
ZJS
1524 break;
1525
63d00dfb 1526 case SCMP_ARCH_PPC:
2a8d6e63
ZJS
1527 case SCMP_ARCH_PPC64:
1528 case SCMP_ARCH_PPC64LE:
1529 filter_syscall = SCMP_SYS(mmap);
1530
1531 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1532 * We ignore that here, which means there's still a way to get writable/executable
1533 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
8a50cf69 1534
8a50cf69
LP
1535 break;
1536
4278d1f5
ZJS
1537 case SCMP_ARCH_ARM:
1538 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1539 shmat_syscall = SCMP_SYS(shmat);
1540 break;
1541
8a50cf69
LP
1542 case SCMP_ARCH_X86_64:
1543 case SCMP_ARCH_X32:
79873bc8 1544 case SCMP_ARCH_AARCH64:
303d6b4c 1545 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
8a50cf69
LP
1546 shmat_syscall = SCMP_SYS(shmat);
1547 break;
1548
1549 /* Please add more definitions here, if you port systemd to other architectures! */
1550
303d6b4c 1551#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
8a50cf69
LP
1552#warning "Consider adding the right mmap() syscall definitions here!"
1553#endif
1554 }
1555
1556 /* Can't filter mmap() on this arch, then skip it */
1557 if (filter_syscall == 0)
1558 continue;
1559
469830d1
LP
1560 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1561 if (r < 0)
1562 return r;
1563
6dc66688
ZJS
1564 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1565 1,
1566 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1567 if (r < 0)
1568 continue;
8a50cf69
LP
1569
1570 if (block_syscall != 0) {
6dc66688
ZJS
1571 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1572 if (r < 0)
8a50cf69 1573 continue;
add00535 1574 }
a3be2849 1575
6dc66688
ZJS
1576 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1577 1,
b835eeb4
ZJS
1578 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1579 if (r < 0)
1580 continue;
1581
91691f1d 1582#ifdef __NR_pkey_mprotect
b835eeb4
ZJS
1583 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1584 1,
6dc66688
ZJS
1585 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1586 if (r < 0)
469830d1 1587 continue;
91691f1d 1588#endif
add00535 1589
67fb5f33 1590 if (shmat_syscall > 0) {
6dc66688
ZJS
1591 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1592 1,
1593 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1594 if (r < 0)
8a50cf69 1595 continue;
469830d1
LP
1596 }
1597
1598 r = seccomp_load(seccomp);
1599 if (IN_SET(r, -EPERM, -EACCES))
1600 return r;
add00535 1601 if (r < 0)
469830d1
LP
1602 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1603 }
add00535 1604
469830d1
LP
1605 return 0;
1606}
1607
1608int seccomp_restrict_archs(Set *archs) {
1609 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1610 Iterator i;
1611 void *id;
1612 int r;
1613
1614 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
2428aaf8
AJ
1615 * list.
1616 *
1617 * There are some qualifications. However the most important use is to stop processes from bypassing
1618 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1619 * in a non-native architecture. There are no holes in this use case, at least so far. */
469830d1 1620
2428aaf8
AJ
1621 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1622 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1623 * to run a program with the restrictions applied. */
469830d1
LP
1624 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1625 if (!seccomp)
1626 return -ENOMEM;
1627
1628 SET_FOREACH(id, archs, i) {
1629 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
2428aaf8
AJ
1630 if (r < 0 && r != -EEXIST)
1631 return r;
1632 }
1633
1634 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1635 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1636 * The important thing is that you can block the old 32-bit x86 syscalls.
1637 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1638
1639 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1640 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1641
1642 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1643 if (r < 0 && r != -EEXIST)
469830d1 1644 return r;
add00535
LP
1645 }
1646
469830d1
LP
1647 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1648 if (r < 0)
1649 return r;
add00535 1650
1c6af69b
LP
1651 r = seccomp_load(seccomp);
1652 if (IN_SET(r, -EPERM, -EACCES))
1653 return r;
1654 if (r < 0)
1655 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1656
1657 return 0;
a3be2849 1658}
b16bd535
YW
1659
1660int parse_syscall_archs(char **l, Set **archs) {
1661 _cleanup_set_free_ Set *_archs;
1662 char **s;
1663 int r;
1664
1665 assert(l);
1666 assert(archs);
1667
1668 r = set_ensure_allocated(&_archs, NULL);
1669 if (r < 0)
1670 return r;
1671
1672 STRV_FOREACH(s, l) {
1673 uint32_t a;
1674
1675 r = seccomp_arch_from_string(*s, &a);
1676 if (r < 0)
1677 return -EINVAL;
1678
1679 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1680 if (r < 0)
1681 return -ENOMEM;
1682 }
1683
1cc6c93a 1684 *archs = TAKE_PTR(_archs);
b16bd535
YW
1685
1686 return 0;
1687}
165a31c0 1688
8cfa775f 1689int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
165a31c0
LP
1690 const char *i;
1691 int r;
1692
1693 assert(set);
1694
1695 NULSTR_FOREACH(i, set->value) {
1696
1697 if (i[0] == '@') {
1698 const SyscallFilterSet *more;
1699
1700 more = syscall_filter_set_find(i);
1701 if (!more)
1702 return -ENXIO;
1703
165a31c0
LP
1704 r = seccomp_filter_set_add(filter, add, more);
1705 if (r < 0)
1706 return r;
1707 } else {
1708 int id;
1709
1710 id = seccomp_syscall_resolve_name(i);
ff217dc3
LP
1711 if (id == __NR_SCMP_ERROR) {
1712 log_debug("Couldn't resolve system call, ignoring: %s", i);
1713 continue;
1714 }
165a31c0
LP
1715
1716 if (add) {
8cfa775f 1717 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
165a31c0
LP
1718 if (r < 0)
1719 return r;
1720 } else
8cfa775f 1721 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
165a31c0
LP
1722 }
1723 }
1724
1725 return 0;
1726}
78e864e5
TM
1727
1728int seccomp_lock_personality(unsigned long personality) {
72eafe71 1729 uint32_t arch;
78e864e5
TM
1730 int r;
1731
72eafe71
LP
1732 if (personality >= PERSONALITY_INVALID)
1733 return -EINVAL;
78e864e5 1734
72eafe71
LP
1735 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1736 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
78e864e5 1737
72eafe71
LP
1738 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1739 if (r < 0)
1740 return r;
1741
1742 r = seccomp_rule_add_exact(
1743 seccomp,
1744 SCMP_ACT_ERRNO(EPERM),
1745 SCMP_SYS(personality),
1746 1,
1747 SCMP_A0(SCMP_CMP_NE, personality));
448ac526
LP
1748 if (r < 0) {
1749 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1750 continue;
1751 }
72eafe71
LP
1752
1753 r = seccomp_load(seccomp);
1754 if (IN_SET(r, -EPERM, -EACCES))
1755 return r;
1756 if (r < 0)
1757 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1758 }
1759
1760 return 0;
78e864e5 1761}
aecd5ac6
TM
1762
1763int seccomp_protect_hostname(void) {
1764 uint32_t arch;
1765 int r;
1766
1767 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1768 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1769
1770 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1771 if (r < 0)
1772 return r;
1773
1774 r = seccomp_rule_add_exact(
1775 seccomp,
1776 SCMP_ACT_ERRNO(EPERM),
1777 SCMP_SYS(sethostname),
1778 0);
1779 if (r < 0)
1780 continue;
1781
1782 r = seccomp_rule_add_exact(
1783 seccomp,
1784 SCMP_ACT_ERRNO(EPERM),
1785 SCMP_SYS(setdomainname),
1786 0);
1787 if (r < 0)
1788 continue;
1789
1790 r = seccomp_load(seccomp);
1791 if (IN_SET(r, -EPERM, -EACCES))
1792 return r;
1793 if (r < 0)
1794 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1795 }
1796
1797 return 0;
1798}